From bd1a5c849bdcc5c89e4a6a18216cd2b9a7a8a78f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Thu, 13 Aug 2009 16:34:53 -0400
Subject: tracing: Ftrace dynamic ftrace_event_call support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add dynamic ftrace_event_call support to ftrace. Trace engines can add
new ftrace_event_call to ftrace on the fly. Each operator function of
the call takes an ftrace_event_call data structure as an argument,
because these functions may be shared among several ftrace_event_calls.

Changes from v13:
 - Define remove_subsystem_dir() always (revirt a2ca5e03), because
   trace_remove_event_call() uses it.
 - Modify syscall tracer because of ftrace_event_call change.

[fweisbec@gmail.com: Fixed conflict against latest tracing/core]

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: K.Prasad <prasad@linux.vnet.ibm.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Przemysław Pawełczyk <przemyslaw@pawelczyk.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
LKML-Reference: <20090813203453.31965.71901.stgit@localhost.localdomain>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/ftrace_event.h | 19 ++++++++++---------
 include/linux/syscalls.h     |  4 ++--
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index ace2da9e0a0d..1ab3089b5c59 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -112,12 +112,12 @@ struct ftrace_event_call {
 	struct dentry		*dir;
 	struct trace_event	*event;
 	int			enabled;
-	int			(*regfunc)(void *);
-	void			(*unregfunc)(void *);
+	int			(*regfunc)(struct ftrace_event_call *);
+	void			(*unregfunc)(struct ftrace_event_call *);
 	int			id;
-	int			(*raw_init)(void);
-	int			(*show_format)(struct ftrace_event_call *call,
-					       struct trace_seq *s);
+	int			(*raw_init)(struct ftrace_event_call *);
+	int			(*show_format)(struct ftrace_event_call *,
+					       struct trace_seq *);
 	int			(*define_fields)(struct ftrace_event_call *);
 	struct list_head	fields;
 	int			filter_active;
@@ -147,11 +147,12 @@ enum {
 	FILTER_PTR_STRING,
 };
 
-extern int trace_define_field(struct ftrace_event_call *call,
-			      const char *type, const char *name,
-			      int offset, int size, int is_signed,
-			      int filter_type);
 extern int trace_define_common_fields(struct ftrace_event_call *call);
+extern int trace_define_field(struct ftrace_event_call *call, char *type,
+			      char *name, int offset, int size, int is_signed,
+			      int filter_type);
+extern int trace_add_event_call(struct ftrace_event_call *call);
+extern void trace_remove_event_call(struct ftrace_event_call *call);
 
 #define is_signed_type(type)	(((type)(-1)) < 0)
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f124c8995555..646102eeff92 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -165,7 +165,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
 	struct trace_event enter_syscall_print_##sname = {		\
 		.trace                  = print_syscall_enter,		\
 	};								\
-	static int init_enter_##sname(void)				\
+	static int init_enter_##sname(struct ftrace_event_call *call)	\
 	{								\
 		int num, id;						\
 		num = syscall_name_to_nr("sys"#sname);			\
@@ -202,7 +202,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *event_call) \
 	struct trace_event exit_syscall_print_##sname = {		\
 		.trace                  = print_syscall_exit,		\
 	};								\
-	static int init_exit_##sname(void)				\
+	static int init_exit_##sname(struct ftrace_event_call *call)	\
 	{								\
 		int num, id;						\
 		num = syscall_name_to_nr("sys"#sname);			\
-- 
cgit v1.2.3


From 24851d2447830e6cba4c4b641cb73e713f312373 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 26 Aug 2009 23:38:30 +0200
Subject: tracing/kprobes: Dump the culprit kprobe in case of kprobe recursion

Kprobes can enter into a probing recursion, ie: a kprobe that does an
endless loop because one of its core mechanism function used during
probing is also probed itself.

This patch helps pinpointing the kprobe that raised such recursion
by dumping it and raising a BUG instead of a warning (we also disarm
the kprobe to try avoiding recursion in BUG itself). Having a BUG
instead of a warning stops the stacktrace in the right place and
doesn't pollute the logs with hundreds of traces that eventually end
up in a stack overflow.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
---
 arch/x86/kernel/kprobes.c | 8 ++++++--
 include/linux/kprobes.h   | 2 ++
 kernel/kprobes.c          | 7 +++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 16ae9610f6ff..ecee3d23fef8 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -490,9 +490,13 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 			/* A probe has been hit in the codepath leading up
 			 * to, or just after, single-stepping of a probed
 			 * instruction. This entire codepath should strictly
-			 * reside in .kprobes.text section. Raise a warning
-			 * to highlight this peculiar case.
+			 * reside in .kprobes.text section.
+			 * Raise a BUG or we'll continue in an endless
+			 * reentering loop and eventually a stack overflow.
 			 */
+			arch_disarm_kprobe(p);
+			dump_kprobe(p);
+			BUG();
 		}
 	default:
 		/* impossible cases */
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index bcd9c07848be..87eb79c9dd60 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -296,6 +296,8 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 int disable_kprobe(struct kprobe *kp);
 int enable_kprobe(struct kprobe *kp);
 
+void dump_kprobe(struct kprobe *kp);
+
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobes_built_in(void)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ef177d653b2c..f72e96c25a38 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1141,6 +1141,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
 	arch_remove_kprobe(p);
 }
 
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+	printk(KERN_WARNING "Dumping kprobe:\n");
+	printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+	       kp->symbol_name, kp->addr, kp->offset);
+}
+
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
 					     unsigned long val, void *data)
-- 
cgit v1.2.3


From aeaeae1187d7520f1c5559623f0a149da6a1c96e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 27 Aug 2009 05:09:51 +0200
Subject: tracing: Restore the const qualifier for field names and types
 definition

Restore the const qualifier in field's name and type parameters of
trace_define_field that was lost while solving a conflict.

Fields names and types are defined as builtin constant strings in
static TRACE_EVENTs. But kprobes allocates these dynamically.

That said, we still want to always pass these strings as const char *
in trace_define_fields() to avoid any further accidental writes on
the pointed strings.

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h  | 6 +++---
 kernel/trace/trace_events.c   | 4 ++--
 kernel/trace/trace_syscalls.c | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 1ab3089b5c59..73edf5a52e31 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -148,9 +148,9 @@ enum {
 };
 
 extern int trace_define_common_fields(struct ftrace_event_call *call);
-extern int trace_define_field(struct ftrace_event_call *call, char *type,
-			      char *name, int offset, int size, int is_signed,
-			      int filter_type);
+extern int trace_define_field(struct ftrace_event_call *call, const char *type,
+			      const char *name, int offset, int size,
+			      int is_signed, int filter_type);
 extern int trace_add_event_call(struct ftrace_event_call *call);
 extern void trace_remove_event_call(struct ftrace_event_call *call);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8079bb511c43..197cdaa96c43 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,8 +27,8 @@ DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
 
-int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size, int is_signed,
+int trace_define_field(struct ftrace_event_call *call, const char *type,
+		       const char *name, int offset, int size, int is_signed,
 		       int filter_type)
 {
 	struct ftrace_event_field *field;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5931933587e9..a928dd004535 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -193,8 +193,8 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
 		return ret;
 
 	for (i = 0; i < meta->nb_args; i++) {
-		ret = trace_define_field(call, (char *)meta->types[i],
-					 (char *)meta->args[i], offset,
+		ret = trace_define_field(call, meta->types[i],
+					 meta->args[i], offset,
 					 sizeof(unsigned long), 0,
 					 FILTER_OTHER);
 		offset += sizeof(unsigned long);
-- 
cgit v1.2.3


From 979f693def9084a452846365dfde5dcb28366333 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 22 Sep 2009 14:44:11 +0200
Subject: ratelimit: Use per ratelimit context locking

I'd like to use printk_ratelimit() in atomic context, but that's
not possible right now due to the spinlock usage this commit
introduced more than a year ago:

  717115e: printk ratelimiting rewrite

As a first step push the lock into the ratelimit state structure.
This allows us to deal with locking failures to be considered as an
event related to that state being too busy.

Also clean up the code a bit (without changing functionality):

 - tidy up the definitions

 - clean up the code flow

This also shrinks the code a tiny bit:

   text	   data	    bss	    dec	    hex	filename
    264	      0	      4	    268	    10c	ratelimit.o.before
    255	      0	      0	    255	     ff	ratelimit.o.after

( Whole-kernel data size got a bit larger, because we have
  two ratelimit-state data structures right now. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David S. Miller <davem@davemloft.net>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ratelimit.h | 30 ++++++++++++++++++++----------
 lib/ratelimit.c           | 29 +++++++++++++----------------
 2 files changed, 33 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 00044b856453..187bc16c1f15 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -1,20 +1,30 @@
 #ifndef _LINUX_RATELIMIT_H
 #define _LINUX_RATELIMIT_H
+
 #include <linux/param.h>
+#include <linux/spinlock_types.h>
 
-#define DEFAULT_RATELIMIT_INTERVAL (5 * HZ)
-#define DEFAULT_RATELIMIT_BURST 10
+#define DEFAULT_RATELIMIT_INTERVAL	(5 * HZ)
+#define DEFAULT_RATELIMIT_BURST		10
 
 struct ratelimit_state {
-	int interval;
-	int burst;
-	int printed;
-	int missed;
-	unsigned long begin;
+	spinlock_t	lock;		/* protect the state */
+
+	int		interval;
+	int		burst;
+	int		printed;
+	int		missed;
+	unsigned long	begin;
 };
 
-#define DEFINE_RATELIMIT_STATE(name, interval, burst)		\
-		struct ratelimit_state name = {interval, burst,}
+#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init)		\
+									\
+	struct ratelimit_state name = {					\
+		.lock		= __SPIN_LOCK_UNLOCKED(name.lock),	\
+		.interval	= interval_init,			\
+		.burst		= burst_init,				\
+	}
 
 extern int __ratelimit(struct ratelimit_state *rs);
-#endif
+
+#endif /* _LINUX_RATELIMIT_H */
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 26187edcc7ea..0e2c28e8a0ca 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -7,15 +7,12 @@
  * parameter. Now every user can use their own standalone ratelimit_state.
  *
  * This file is released under the GPLv2.
- *
  */
 
 #include <linux/kernel.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
 
-static DEFINE_SPINLOCK(ratelimit_lock);
-
 /*
  * __ratelimit - rate limiting
  * @rs: ratelimit_state data
@@ -26,11 +23,12 @@ static DEFINE_SPINLOCK(ratelimit_lock);
 int __ratelimit(struct ratelimit_state *rs)
 {
 	unsigned long flags;
+	int ret;
 
 	if (!rs->interval)
 		return 1;
 
-	spin_lock_irqsave(&ratelimit_lock, flags);
+	spin_lock_irqsave(&rs->lock, flags);
 	if (!rs->begin)
 		rs->begin = jiffies;
 
@@ -38,20 +36,19 @@ int __ratelimit(struct ratelimit_state *rs)
 		if (rs->missed)
 			printk(KERN_WARNING "%s: %d callbacks suppressed\n",
 				__func__, rs->missed);
-		rs->begin = 0;
+		rs->begin   = 0;
 		rs->printed = 0;
-		rs->missed = 0;
+		rs->missed  = 0;
 	}
-	if (rs->burst && rs->burst > rs->printed)
-		goto print;
-
-	rs->missed++;
-	spin_unlock_irqrestore(&ratelimit_lock, flags);
-	return 0;
+	if (rs->burst && rs->burst > rs->printed) {
+		rs->printed++;
+		ret = 1;
+	} else {
+		rs->missed++;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&rs->lock, flags);
 
-print:
-	rs->printed++;
-	spin_unlock_irqrestore(&ratelimit_lock, flags);
-	return 1;
+	return ret;
 }
 EXPORT_SYMBOL(__ratelimit);
-- 
cgit v1.2.3


From 3fff4c42bd0a89869a0eb1e7874cc06ffa4aa0f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 22 Sep 2009 16:18:09 +0200
Subject: printk: Remove ratelimit.h from kernel.h

Decouple kernel.h from ratelimit.h: the global declaration of
printk's ratelimit_state is not needed, and it leads to messy
circular dependencies due to ratelimit.h's (new) adding of a
spinlock_types.h include.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David S. Miller <davem@davemloft.net>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h     | 2 --
 include/linux/net.h        | 1 +
 kernel/printk.c            | 1 +
 kernel/sysctl.c            | 3 +++
 lib/ratelimit.c            | 2 +-
 net/core/sysctl_net_core.c | 2 ++
 net/core/utils.c           | 2 ++
 7 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b5b1e0899a8..3305f33201be 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -15,7 +15,6 @@
 #include <linux/bitops.h>
 #include <linux/log2.h>
 #include <linux/typecheck.h>
-#include <linux/ratelimit.h>
 #include <linux/dynamic_debug.h>
 #include <asm/byteorder.h>
 #include <asm/bug.h>
@@ -241,7 +240,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 asmlinkage int printk(const char * fmt, ...)
 	__attribute__ ((format (printf, 1, 2))) __cold;
 
-extern struct ratelimit_state printk_ratelimit_state;
 extern int printk_ratelimit(void);
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
diff --git a/include/linux/net.h b/include/linux/net.h
index 9040a10584f7..df20f680f455 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -358,6 +358,7 @@ static const struct proto_ops name##_ops = {			\
 
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
+#include <linux/ratelimit.h>
 extern struct ratelimit_state net_ratelimit_state;
 #endif
 
diff --git a/kernel/printk.c b/kernel/printk.c
index 602033acd6c7..b997c893cdcf 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/kexec.h>
+#include <linux/ratelimit.h>
 
 #include <asm/uaccess.h>
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1a631ba684a4..6c37048b9db9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,6 +37,7 @@
 #include <linux/sysrq.h>
 #include <linux/highuid.h>
 #include <linux/writeback.h>
+#include <linux/ratelimit.h>
 #include <linux/hugetlb.h>
 #include <linux/initrd.h>
 #include <linux/key.h>
@@ -155,6 +156,8 @@ extern int no_unaligned_warning;
 extern int unaligned_dump_stack;
 #endif
 
+extern struct ratelimit_state printk_ratelimit_state;
+
 #ifdef CONFIG_RT_MUTEXES
 extern int max_lock_depth;
 #endif
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 69bfcacda16d..5551731ae1d4 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -9,7 +9,7 @@
  * This file is released under the GPLv2.
  */
 
-#include <linux/kernel.h>
+#include <linux/ratelimit.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7db1de0497c6..887c03c4e3c6 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -10,7 +10,9 @@
 #include <linux/module.h>
 #include <linux/socket.h>
 #include <linux/netdevice.h>
+#include <linux/ratelimit.h>
 #include <linux/init.h>
+
 #include <net/ip.h>
 #include <net/sock.h>
 
diff --git a/net/core/utils.c b/net/core/utils.c
index 83221aee7084..838250241d26 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -24,6 +24,8 @@
 #include <linux/types.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
+#include <linux/ratelimit.h>
+
 #include <net/sock.h>
 
 #include <asm/byteorder.h>
-- 
cgit v1.2.3


From 96a2c464de07d7c72988db851c029b204fc59108 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 1 Aug 2009 01:34:24 +0200
Subject: tracing/bkl: Add bkl ftrace events

Add two events lock_kernel and unlock_kernel() to trace the bkl uses.
This opens the door for userspace tools to perform statistics about
the callsites that use it, dependencies with other locks (by pairing
the trace with lock events), use with recursivity and so on...

The {__reacquire,release}_kernel_lock() events are not traced because
these are called from schedule, thus the sched events are sufficient
to trace them.

Example of a trace:

hald-addon-stor-4152  [000]   165.875501: unlock_kernel: depth: 0, fs/block_dev.c:1358 __blkdev_put()
hald-addon-stor-4152  [000]   167.832974: lock_kernel: depth: 0, fs/block_dev.c:1167 __blkdev_get()

How to get the callsites that acquire it recursively:

cd /debug/tracing/events/bkl
echo "lock_depth > 0" > filter

firefox-4951  [001]   206.276967: unlock_kernel: depth: 1, fs/reiserfs/super.c:575 reiserfs_dirty_inode()

You can also filter by file and/or line.

v2: Use of FILTER_PTR_STRING attribute for files and lines fields to
    make them traceable.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/smp_lock.h   | 19 ++++++++++++---
 include/trace/events/bkl.h | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/kernel_lock.c          | 11 +++++----
 3 files changed, 82 insertions(+), 9 deletions(-)
 create mode 100644 include/trace/events/bkl.h

(limited to 'include/linux')

diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h
index 813be59bf345..d48cc77ba70d 100644
--- a/include/linux/smp_lock.h
+++ b/include/linux/smp_lock.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_LOCK_KERNEL
 #include <linux/sched.h>
+#include <trace/events/bkl.h>
 
 #define kernel_locked()		(current->lock_depth >= 0)
 
@@ -24,8 +25,18 @@ static inline int reacquire_kernel_lock(struct task_struct *task)
 	return 0;
 }
 
-extern void __lockfunc lock_kernel(void)	__acquires(kernel_lock);
-extern void __lockfunc unlock_kernel(void)	__releases(kernel_lock);
+extern void __lockfunc _lock_kernel(void)	__acquires(kernel_lock);
+extern void __lockfunc _unlock_kernel(void)	__releases(kernel_lock);
+
+#define lock_kernel()	{					\
+	trace_lock_kernel(__func__, __FILE__, __LINE__);	\
+	_lock_kernel();						\
+}
+
+#define unlock_kernel()	{					\
+	trace_unlock_kernel(__func__, __FILE__, __LINE__);	\
+	_unlock_kernel();					\
+}
 
 /*
  * Various legacy drivers don't really need the BKL in a specific
@@ -41,8 +52,8 @@ static inline void cycle_kernel_lock(void)
 
 #else
 
-#define lock_kernel()				do { } while(0)
-#define unlock_kernel()				do { } while(0)
+#define lock_kernel()	   trace_lock_kernel(__func__, __FILE__, __LINE__);
+#define unlock_kernel()    trace_unlock_kernel(__func__, __FILE__, __LINE__);
 #define release_kernel_lock(task)		do { } while(0)
 #define cycle_kernel_lock()			do { } while(0)
 #define reacquire_kernel_lock(task)		0
diff --git a/include/trace/events/bkl.h b/include/trace/events/bkl.h
new file mode 100644
index 000000000000..8abd620a490e
--- /dev/null
+++ b/include/trace/events/bkl.h
@@ -0,0 +1,61 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bkl
+
+#if !defined(_TRACE_BKL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BKL_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(lock_kernel,
+
+	TP_PROTO(const char *func, const char *file, int line),
+
+	TP_ARGS(func, file, line),
+
+	TP_STRUCT__entry(
+		__field(	int,		lock_depth		)
+		__field_ext(	const char *,	func, FILTER_PTR_STRING	)
+		__field_ext(	const char *,	file, FILTER_PTR_STRING	)
+		__field(	int,		line			)
+	),
+
+	TP_fast_assign(
+		/* We want to record the lock_depth after lock is acquired */
+		__entry->lock_depth = current->lock_depth + 1;
+		__entry->func = func;
+		__entry->file = file;
+		__entry->line = line;
+	),
+
+	TP_printk("depth: %d, %s:%d %s()", __entry->lock_depth,
+		  __entry->file, __entry->line, __entry->func)
+);
+
+TRACE_EVENT(unlock_kernel,
+
+	TP_PROTO(const char *func, const char *file, int line),
+
+	TP_ARGS(func, file, line),
+
+	TP_STRUCT__entry(
+		__field(int,		lock_depth)
+		__field(const char *,	func)
+		__field(const char *,	file)
+		__field(int,		line)
+	),
+
+	TP_fast_assign(
+		__entry->lock_depth = current->lock_depth;
+		__entry->func = func;
+		__entry->file = file;
+		__entry->line = line;
+	),
+
+	TP_printk("depth: %d, %s:%d %s()", __entry->lock_depth,
+		  __entry->file, __entry->line, __entry->func)
+);
+
+#endif /* _TRACE_BKL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 39f1029e3525..5c10b2e1fd08 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -5,10 +5,11 @@
  * relegated to obsolescence, but used by various less
  * important (or lazy) subsystems.
  */
-#include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/semaphore.h>
+#define CREATE_TRACE_POINTS
+#include <linux/smp_lock.h>
 
 /*
  * The 'big kernel lock'
@@ -113,7 +114,7 @@ static inline void __unlock_kernel(void)
  * This cannot happen asynchronously, so we only need to
  * worry about other CPU's.
  */
-void __lockfunc lock_kernel(void)
+void __lockfunc _lock_kernel(void)
 {
 	int depth = current->lock_depth+1;
 	if (likely(!depth))
@@ -121,13 +122,13 @@ void __lockfunc lock_kernel(void)
 	current->lock_depth = depth;
 }
 
-void __lockfunc unlock_kernel(void)
+void __lockfunc _unlock_kernel(void)
 {
 	BUG_ON(current->lock_depth < 0);
 	if (likely(--current->lock_depth < 0))
 		__unlock_kernel();
 }
 
-EXPORT_SYMBOL(lock_kernel);
-EXPORT_SYMBOL(unlock_kernel);
+EXPORT_SYMBOL(_lock_kernel);
+EXPORT_SYMBOL(_unlock_kernel);
 
-- 
cgit v1.2.3


From 9f0cf4adb6aa0bfccf675c938124e68f7f06349d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sat, 26 Sep 2009 14:33:01 +0200
Subject: x86: Use __builtin_object_size() to validate the buffer size for
 copy_from_user()

gcc (4.x) supports the __builtin_object_size() builtin, which
reports the size of an object that a pointer point to, when known
at compile time. If the buffer size is not known at compile time, a
constant -1 is returned.

This patch uses this feature to add a sanity check to
copy_from_user(); if the target buffer is known to be smaller than
the copy size, the copy is aborted and a WARNing is emitted in
memory debug mode.

These extra checks compile away when the object size is not known,
or if both the buffer size and the copy length are constants.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <20090926143301.2c396b94@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uaccess_32.h | 19 ++++++++++++++++++-
 arch/x86/include/asm/uaccess_64.h | 19 ++++++++++++++++++-
 arch/x86/kernel/x8664_ksyms_64.c  |  2 +-
 arch/x86/lib/copy_user_64.S       |  4 ++--
 arch/x86/lib/usercopy_32.c        |  4 ++--
 include/linux/compiler-gcc4.h     |  2 ++
 include/linux/compiler.h          |  4 ++++
 7 files changed, 47 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 632fb44b4cb5..582d6aef7417 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -187,9 +187,26 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
 
 unsigned long __must_check copy_to_user(void __user *to,
 					const void *from, unsigned long n);
-unsigned long __must_check copy_from_user(void *to,
+unsigned long __must_check _copy_from_user(void *to,
 					  const void __user *from,
 					  unsigned long n);
+
+static inline unsigned long __must_check copy_from_user(void *to,
+					  const void __user *from,
+					  unsigned long n)
+{
+	int sz = __compiletime_object_size(to);
+	int ret = -EFAULT;
+
+	if (likely(sz == -1 || sz >= n))
+		ret = _copy_from_user(to, from, n);
+#ifdef CONFIG_DEBUG_VM
+	else
+		WARN(1, "Buffer overflow detected!\n");
+#endif
+	return ret;
+}
+
 long __must_check strncpy_from_user(char *dst, const char __user *src,
 				    long count);
 long __must_check __strncpy_from_user(char *dst,
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index db24b215fc50..ce6fec7ce38d 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -21,10 +21,27 @@ copy_user_generic(void *to, const void *from, unsigned len);
 __must_check unsigned long
 copy_to_user(void __user *to, const void *from, unsigned len);
 __must_check unsigned long
-copy_from_user(void *to, const void __user *from, unsigned len);
+_copy_from_user(void *to, const void __user *from, unsigned len);
 __must_check unsigned long
 copy_in_user(void __user *to, const void __user *from, unsigned len);
 
+static inline unsigned long __must_check copy_from_user(void *to,
+					  const void __user *from,
+					  unsigned long n)
+{
+	int sz = __compiletime_object_size(to);
+	int ret = -EFAULT;
+
+	if (likely(sz == -1 || sz >= n))
+		ret = _copy_from_user(to, from, n);
+#ifdef CONFIG_DEBUG_VM
+	else
+		WARN(1, "Buffer overflow detected!\n");
+#endif
+	return ret;
+}
+
+
 static __always_inline __must_check
 int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..a0cdd8cc1d67 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -30,7 +30,7 @@ EXPORT_SYMBOL(__put_user_8);
 
 EXPORT_SYMBOL(copy_user_generic);
 EXPORT_SYMBOL(__copy_user_nocache);
-EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(_copy_from_user);
 EXPORT_SYMBOL(copy_to_user);
 EXPORT_SYMBOL(__copy_from_user_inatomic);
 
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 6ba0f7bb85ea..4be3c415b3e9 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -78,7 +78,7 @@ ENTRY(copy_to_user)
 ENDPROC(copy_to_user)
 
 /* Standard copy_from_user with segment limit checking */
-ENTRY(copy_from_user)
+ENTRY(_copy_from_user)
 	CFI_STARTPROC
 	GET_THREAD_INFO(%rax)
 	movq %rsi,%rcx
@@ -88,7 +88,7 @@ ENTRY(copy_from_user)
 	jae bad_from_user
 	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
 	CFI_ENDPROC
-ENDPROC(copy_from_user)
+ENDPROC(_copy_from_user)
 
 ENTRY(copy_user_generic)
 	CFI_STARTPROC
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1f118d462acc..8498684e45b0 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -874,7 +874,7 @@ EXPORT_SYMBOL(copy_to_user);
  * data to the requested size using zero bytes.
  */
 unsigned long
-copy_from_user(void *to, const void __user *from, unsigned long n)
+_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	if (access_ok(VERIFY_READ, from, n))
 		n = __copy_from_user(to, from, n);
@@ -882,4 +882,4 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
 		memset(to, 0, n);
 	return n;
 }
-EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(_copy_from_user);
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index 450fa597c94d..a3aef5d55dba 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -37,3 +37,5 @@
 #define __cold			__attribute__((__cold__))
 
 #endif
+
+#define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 04fb5135b4e1..8e54108688f9 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -266,6 +266,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
 #endif
 
+/* Compile time object size, -1 for unknown */
+#ifndef __compiletime_object_size
+# define __compiletime_object_size(obj) -1
+#endif
 /*
  * Prevent the compiler from merging or refetching accesses.  The compiler
  * is also forbidden from reordering successive instances of ACCESS_ONCE(),
-- 
cgit v1.2.3


From 925936ebf35a95c290e010b784c962164e6728f3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 28 Sep 2009 17:12:49 +0200
Subject: tracing: Pushdown the bkl tracepoints calls

Currently we are calling the bkl tracepoint callbacks just before the
bkl lock/unlock operations, ie the tracepoint call is not inside a
lock_kernel() function but inside a lock_kernel() macro. Hence the
bkl trace event header must be included from smp_lock.h. This raises
some nasty circular header dependencies:

linux/smp_lock.h -> trace/events/bkl.h -> trace/define_trace.h
-> trace/ftrace.h -> linux/ftrace_event.h -> linux/hardirq.h
-> linux/smp_lock.h

This results in incomplete event declarations, spurious event
definitions and other kind of funny behaviours.

This is hardly fixable without ugly workarounds. So instead, we push
the file name, line number and function name as lock_kernel()
parameters, so that we only deal with the trace event header from
lib/kernel_lock.c

This adds two parameters to lock_kernel() and unlock_kernel() but
it should be fine wrt to performances because this pair dos not seem
to be called in fast paths.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/smp_lock.h | 28 +++++++++++++++-------------
 lib/kernel_lock.c        | 15 +++++++++++----
 2 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h
index d48cc77ba70d..2ea1dd1ba21c 100644
--- a/include/linux/smp_lock.h
+++ b/include/linux/smp_lock.h
@@ -3,7 +3,6 @@
 
 #ifdef CONFIG_LOCK_KERNEL
 #include <linux/sched.h>
-#include <trace/events/bkl.h>
 
 #define kernel_locked()		(current->lock_depth >= 0)
 
@@ -25,18 +24,21 @@ static inline int reacquire_kernel_lock(struct task_struct *task)
 	return 0;
 }
 
-extern void __lockfunc _lock_kernel(void)	__acquires(kernel_lock);
-extern void __lockfunc _unlock_kernel(void)	__releases(kernel_lock);
+extern void __lockfunc
+_lock_kernel(const char *func, const char *file, int line)
+__acquires(kernel_lock);
 
-#define lock_kernel()	{					\
-	trace_lock_kernel(__func__, __FILE__, __LINE__);	\
-	_lock_kernel();						\
-}
+extern void __lockfunc
+_unlock_kernel(const char *func, const char *file, int line)
+__releases(kernel_lock);
 
-#define unlock_kernel()	{					\
-	trace_unlock_kernel(__func__, __FILE__, __LINE__);	\
-	_unlock_kernel();					\
-}
+#define lock_kernel() do {					\
+	_lock_kernel(__func__, __FILE__, __LINE__);		\
+} while (0)
+
+#define unlock_kernel()	do {					\
+	_unlock_kernel(__func__, __FILE__, __LINE__);		\
+} while (0)
 
 /*
  * Various legacy drivers don't really need the BKL in a specific
@@ -52,8 +54,8 @@ static inline void cycle_kernel_lock(void)
 
 #else
 
-#define lock_kernel()	   trace_lock_kernel(__func__, __FILE__, __LINE__);
-#define unlock_kernel()    trace_unlock_kernel(__func__, __FILE__, __LINE__);
+#define lock_kernel()
+#define unlock_kernel()
 #define release_kernel_lock(task)		do { } while(0)
 #define cycle_kernel_lock()			do { } while(0)
 #define reacquire_kernel_lock(task)		0
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 5c10b2e1fd08..4ebfa5a164d7 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -8,9 +8,11 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/semaphore.h>
-#define CREATE_TRACE_POINTS
 #include <linux/smp_lock.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/bkl.h>
+
 /*
  * The 'big kernel lock'
  *
@@ -114,19 +116,24 @@ static inline void __unlock_kernel(void)
  * This cannot happen asynchronously, so we only need to
  * worry about other CPU's.
  */
-void __lockfunc _lock_kernel(void)
+void __lockfunc _lock_kernel(const char *func, const char *file, int line)
 {
-	int depth = current->lock_depth+1;
+	int depth = current->lock_depth + 1;
+
+	trace_lock_kernel(func, file, line);
+
 	if (likely(!depth))
 		__lock_kernel();
 	current->lock_depth = depth;
 }
 
-void __lockfunc _unlock_kernel(void)
+void __lockfunc _unlock_kernel(const char *func, const char *file, int line)
 {
 	BUG_ON(current->lock_depth < 0);
 	if (likely(--current->lock_depth < 0))
 		__unlock_kernel();
+
+	trace_unlock_kernel(func, file, line);
 }
 
 EXPORT_SYMBOL(_lock_kernel);
-- 
cgit v1.2.3


From 4a3127693001c61a21d1ce680db6340623f52e93 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Wed, 30 Sep 2009 13:05:23 +0200
Subject: x86: Turn the copy_from_user check into an (optional) compile time
 warning

A previous patch added the buffer size check to copy_from_user().

One of the things learned from analyzing the result of the previous
patch is that in general, gcc is really good at proving that the
code contains sufficient security checks to not need to do a
runtime check. But that for those cases where gcc could not prove
this, there was a relatively high percentage of real security
issues.

This patch turns the case of "gcc cannot prove" into a compile time
warning, as long as a sufficiently new gcc is in use that supports
this. The objective is that these warnings will trigger developers
checking new cases out before a security hole enters a linux kernel
release.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: James Morris <jmorris@namei.org>
Cc: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <20090930130523.348ae6c4@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uaccess_32.h | 12 +++++++++---
 arch/x86/lib/usercopy_32.c        |  6 ++++++
 include/linux/compiler-gcc4.h     |  3 +++
 include/linux/compiler.h          |  4 ++++
 4 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 582d6aef7417..952f9e793c3e 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -191,6 +191,13 @@ unsigned long __must_check _copy_from_user(void *to,
 					  const void __user *from,
 					  unsigned long n);
 
+
+extern void copy_from_user_overflow(void)
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	__compiletime_warning("copy_from_user() buffer size is not provably correct")
+#endif
+;
+
 static inline unsigned long __must_check copy_from_user(void *to,
 					  const void __user *from,
 					  unsigned long n)
@@ -200,10 +207,9 @@ static inline unsigned long __must_check copy_from_user(void *to,
 
 	if (likely(sz == -1 || sz >= n))
 		ret = _copy_from_user(to, from, n);
-#ifdef CONFIG_DEBUG_VM
 	else
-		WARN(1, "Buffer overflow detected!\n");
-#endif
+		copy_from_user_overflow();
+
 	return ret;
 }
 
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 8498684e45b0..e218d5df85ff 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -883,3 +883,9 @@ _copy_from_user(void *to, const void __user *from, unsigned long n)
 	return n;
 }
 EXPORT_SYMBOL(_copy_from_user);
+
+void copy_from_user_overflow(void)
+{
+	WARN(1, "Buffer overflow detected!\n");
+}
+EXPORT_SYMBOL(copy_from_user_overflow);
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index a3aef5d55dba..f1709c1f9eae 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -39,3 +39,6 @@
 #endif
 
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
+#if __GNUC_MINOR__ >= 4
+#define __compiletime_warning(message) __attribute__((warning(message)))
+#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8e54108688f9..950356311f12 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -270,6 +270,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 #ifndef __compiletime_object_size
 # define __compiletime_object_size(obj) -1
 #endif
+#ifndef __compiletime_warning
+# define __compiletime_warning(message)
+#endif
+
 /*
  * Prevent the compiler from merging or refetching accesses.  The compiler
  * is also forbidden from reordering successive instances of ACCESS_ONCE(),
-- 
cgit v1.2.3


From 7c68af6e32c73992bad24107311f3433c89016e2 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sat, 19 Sep 2009 09:40:22 +0300
Subject: core, x86: Add user return notifiers

Add a general per-cpu notifier that is called whenever the kernel is
about to return to userspace.  The notifier uses a thread_info flag
and existing checks, so there is no impact on user return or context
switch fast paths.

This will be used initially to speed up KVM task switching by lazily
updating MSRs.

Signed-off-by: Avi Kivity <avi@redhat.com>
LKML-Reference: <1253342422-13811-1-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/Kconfig                         | 10 ++++++++
 arch/x86/Kconfig                     |  1 +
 arch/x86/include/asm/thread_info.h   |  7 ++++--
 arch/x86/kernel/process.c            |  2 ++
 arch/x86/kernel/signal.c             |  3 +++
 include/linux/user-return-notifier.h | 42 ++++++++++++++++++++++++++++++++
 kernel/user-return-notifier.c        | 46 ++++++++++++++++++++++++++++++++++++
 7 files changed, 109 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/user-return-notifier.h
 create mode 100644 kernel/user-return-notifier.c

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 7f418bbc261a..4e312fffbfd7 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -83,6 +83,13 @@ config KRETPROBES
 	def_bool y
 	depends on KPROBES && HAVE_KRETPROBES
 
+config USER_RETURN_NOTIFIER
+	bool
+	depends on HAVE_USER_RETURN_NOTIFIER
+	help
+	  Provide a kernel-internal notification when a cpu is about to
+	  switch to user mode.
+
 config HAVE_IOREMAP_PROT
 	bool
 
@@ -126,4 +133,7 @@ config HAVE_DMA_API_DEBUG
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
 	bool
 
+config HAVE_USER_RETURN_NOTIFIER
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8da93745c087..1df175d15aa8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -50,6 +50,7 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_USER_RETURN_NOTIFIER
 
 config OUTPUT_FORMAT
 	string
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..375c917c37d2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |	\
+	 _TIF_USER_RETURN_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
 
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 
 #define PREEMPT_ACTIVE		0x10000000
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..e51b056fc88f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
+#include <linux/user-return-notifier.h>
 #include <trace/events/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
@@ -224,6 +225,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
+	propagate_user_return_notify(prev_p, next_p);
 }
 
 int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..c49f90f7957a 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
 #include <linux/stddef.h>
 #include <linux/personality.h>
 #include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		if (current->replacement_session_keyring)
 			key_replace_session_keyring();
 	}
+	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+		fire_user_return_notifiers();
 
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
new file mode 100644
index 000000000000..b6ac056291d7
--- /dev/null
+++ b/include/linux/user-return-notifier.h
@@ -0,0 +1,42 @@
+#ifndef _LINUX_USER_RETURN_NOTIFIER_H
+#define _LINUX_USER_RETURN_NOTIFIER_H
+
+#ifdef CONFIG_USER_RETURN_NOTIFIER
+
+#include <linux/list.h>
+#include <linux/sched.h>
+
+struct user_return_notifier {
+	void (*on_user_return)(struct user_return_notifier *urn);
+	struct hlist_node link;
+};
+
+
+void user_return_notifier_register(struct user_return_notifier *urn);
+void user_return_notifier_unregister(struct user_return_notifier *urn);
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+						struct task_struct *next)
+{
+	if (test_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY)) {
+		clear_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY);
+		set_tsk_thread_flag(next, TIF_USER_RETURN_NOTIFY);
+	}
+}
+
+void fire_user_return_notifiers(void);
+
+#else
+
+struct user_return_notifier {};
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+						struct task_struct *next)
+{
+}
+
+static inline void fire_user_return_notifiers(void) {}
+
+#endif
+
+#endif
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 000000000000..530ccb816513
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,46 @@
+
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+
+#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
+
+/*
+ * Request a notification when the current cpu returns to userspace.  Must be
+ * called in atomic context.  The notifier will also be called in atomic
+ * context.
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+	set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+	hlist_add_head(&urn->link, &URN_LIST_HEAD);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+
+/*
+ * Removes a registered user return notifier.  Must be called from atomic
+ * context, and from the same cpu registration occured in.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+	hlist_del(&urn->link);
+	if (hlist_empty(&URN_LIST_HEAD))
+		clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+
+/* Calls registered user return notifiers */
+void fire_user_return_notifiers(void)
+{
+	struct user_return_notifier *urn;
+	struct hlist_node *tmp1, *tmp2;
+	struct hlist_head *head;
+
+	head = &get_cpu_var(return_notifier_list);
+	hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+		urn->on_user_return(urn);
+	put_cpu_var();
+}
-- 
cgit v1.2.3


From 1122a26f2abe4245ccdaed95ec23f63fe086b332 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 30 Sep 2009 13:52:12 +0200
Subject: block: use normal I/O path for discard requests

prepare_discard_fn() was being called in a place where memory allocation
was effectively impossible.  This makes it inappropriate for all but
the most trivial translations of Linux's DISCARD operation to the block
command set.  Additionally adding a payload there makes the ownership
of the bio backing unclear as it's now allocated by the device driver
and not the submitter as usual.

It is replaced with QUEUE_FLAG_DISCARD which is used to indicate whether
the queue supports discard operations or not.  blkdev_issue_discard now
allocates a one-page, sector-length payload which is the right thing
for the common ATA and SCSI implementations.

The mtd implementation of prepare_discard_fn() is replaced with simply
checking for the request being a discard.

Largely based on a previous patch from Matthew Wilcox <matthew@wil.cx>
which did the prepare_discard_fn but not the different payload allocation
yet.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-barrier.c         | 35 ++++++++++++++++++++++++++++++-----
 block/blk-core.c            |  3 +--
 block/blk-settings.c        | 17 -----------------
 drivers/mtd/mtd_blkdevs.c   | 19 +++++--------------
 drivers/staging/dst/dcore.c |  2 +-
 include/linux/blkdev.h      |  6 ++----
 6 files changed, 39 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6593ab39cfe9..21f5025c3945 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -350,6 +350,7 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
 
 	if (bio->bi_private)
 		complete(bio->bi_private);
+	__free_page(bio_page(bio));
 
 	bio_put(bio);
 }
@@ -372,26 +373,44 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	struct request_queue *q = bdev_get_queue(bdev);
 	int type = flags & DISCARD_FL_BARRIER ?
 		DISCARD_BARRIER : DISCARD_NOBARRIER;
+	struct bio *bio;
+	struct page *page;
 	int ret = 0;
 
 	if (!q)
 		return -ENXIO;
 
-	if (!q->prepare_discard_fn)
+	if (!blk_queue_discard(q))
 		return -EOPNOTSUPP;
 
 	while (nr_sects && !ret) {
-		struct bio *bio = bio_alloc(gfp_mask, 0);
-		if (!bio)
-			return -ENOMEM;
+		unsigned int sector_size = q->limits.logical_block_size;
 
+		bio = bio_alloc(gfp_mask, 1);
+		if (!bio)
+			goto out;
+		bio->bi_sector = sector;
 		bio->bi_end_io = blkdev_discard_end_io;
 		bio->bi_bdev = bdev;
 		if (flags & DISCARD_FL_WAIT)
 			bio->bi_private = &wait;
 
-		bio->bi_sector = sector;
+		/*
+		 * Add a zeroed one-sector payload as that's what
+		 * our current implementations need.  If we'll ever need
+		 * more the interface will need revisiting.
+		 */
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!page)
+			goto out_free_bio;
+		if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
+			goto out_free_page;
 
+		/*
+		 * And override the bio size - the way discard works we
+		 * touch many more blocks on disk than the actual payload
+		 * length.
+		 */
 		if (nr_sects > queue_max_hw_sectors(q)) {
 			bio->bi_size = queue_max_hw_sectors(q) << 9;
 			nr_sects -= queue_max_hw_sectors(q);
@@ -414,5 +433,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		bio_put(bio);
 	}
 	return ret;
+out_free_page:
+	__free_page(page);
+out_free_bio:
+	bio_put(bio);
+out:
+	return -ENOMEM;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-core.c b/block/blk-core.c
index 8135228e4b29..80a020dd1580 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1124,7 +1124,6 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 		req->cmd_flags |= REQ_DISCARD;
 		if (bio_rw_flagged(bio, BIO_RW_BARRIER))
 			req->cmd_flags |= REQ_SOFTBARRIER;
-		req->q->prepare_discard_fn(req->q, req);
 	} else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)))
 		req->cmd_flags |= REQ_HARDBARRIER;
 
@@ -1470,7 +1469,7 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 
 		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
-		    !q->prepare_discard_fn) {
+		    !blk_queue_discard(q)) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index eaf122ff5f16..d29498ef1eb5 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -33,23 +33,6 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
 }
 EXPORT_SYMBOL(blk_queue_prep_rq);
 
-/**
- * blk_queue_set_discard - set a discard_sectors function for queue
- * @q:		queue
- * @dfn:	prepare_discard function
- *
- * It's possible for a queue to register a discard callback which is used
- * to transform a discard request into the appropriate type for the
- * hardware. If none is registered, then discard requests are failed
- * with %EOPNOTSUPP.
- *
- */
-void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn)
-{
-	q->prepare_discard_fn = dfn;
-}
-EXPORT_SYMBOL(blk_queue_set_discard);
-
 /**
  * blk_queue_merge_bvec - set a merge_bvec function for queue
  * @q:		queue
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 0acbf4f5be50..8ca17a3e96ea 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -32,14 +32,6 @@ struct mtd_blkcore_priv {
 	spinlock_t queue_lock;
 };
 
-static int blktrans_discard_request(struct request_queue *q,
-				    struct request *req)
-{
-	req->cmd_type = REQ_TYPE_LINUX_BLOCK;
-	req->cmd[0] = REQ_LB_OP_DISCARD;
-	return 0;
-}
-
 static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 			       struct mtd_blktrans_dev *dev,
 			       struct request *req)
@@ -52,10 +44,6 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 
 	buf = req->buffer;
 
-	if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
-	    req->cmd[0] == REQ_LB_OP_DISCARD)
-		return tr->discard(dev, block, nsect);
-
 	if (!blk_fs_request(req))
 		return -EIO;
 
@@ -63,6 +51,9 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	    get_capacity(req->rq_disk))
 		return -EIO;
 
+	if (blk_discard_rq(req))
+		return tr->discard(dev, block, nsect);
+
 	switch(rq_data_dir(req)) {
 	case READ:
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
@@ -380,8 +371,8 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 	tr->blkcore_priv->rq->queuedata = tr;
 	blk_queue_logical_block_size(tr->blkcore_priv->rq, tr->blksize);
 	if (tr->discard)
-		blk_queue_set_discard(tr->blkcore_priv->rq,
-				      blktrans_discard_request);
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+					tr->blkcore_priv->rq);
 
 	tr->blkshift = ffs(tr->blksize) - 1;
 
diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c
index ac8577358ba0..5e8db0677582 100644
--- a/drivers/staging/dst/dcore.c
+++ b/drivers/staging/dst/dcore.c
@@ -102,7 +102,7 @@ static int dst_request(struct request_queue *q, struct bio *bio)
 	struct dst_node *n = q->queuedata;
 	int err = -EIO;
 
-	if (bio_empty_barrier(bio) && !q->prepare_discard_fn) {
+	if (bio_empty_barrier(bio) && !blk_queue_discard(q)) {
 		/*
 		 * This is a dirty^Wnice hack, but if we complete this
 		 * operation with -EOPNOTSUPP like intended, XFS
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e23a86cae5ac..f62d45e87618 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -82,7 +82,6 @@ enum rq_cmd_type_bits {
 enum {
 	REQ_LB_OP_EJECT	= 0x40,		/* eject request */
 	REQ_LB_OP_FLUSH = 0x41,		/* flush request */
-	REQ_LB_OP_DISCARD = 0x42,	/* discard sectors */
 };
 
 /*
@@ -261,7 +260,6 @@ typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unplug_fn) (struct request_queue *);
-typedef int (prepare_discard_fn) (struct request_queue *, struct request *);
 
 struct bio_vec;
 struct bvec_merge_data {
@@ -340,7 +338,6 @@ struct request_queue
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
 	unplug_fn		*unplug_fn;
-	prepare_discard_fn	*prepare_discard_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	prepare_flush_fn	*prepare_flush_fn;
 	softirq_done_fn		*softirq_done_fn;
@@ -460,6 +457,7 @@ struct request_queue
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT     15	/* do IO stats */
 #define QUEUE_FLAG_CQ	       16	/* hardware does queuing */
+#define QUEUE_FLAG_DISCARD     17	/* supports DISCARD */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_CLUSTER) |		\
@@ -591,6 +589,7 @@ enum {
 #define blk_queue_flushing(q)	((q)->ordseq)
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
+#define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
 
 #define blk_fs_request(rq)	((rq)->cmd_type == REQ_TYPE_FS)
 #define blk_pc_request(rq)	((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
@@ -955,7 +954,6 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
-extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *);
 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
-- 
cgit v1.2.3


From ca80650cfbde5b17a5fa957a261c7973f84599a7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 30 Sep 2009 13:54:20 +0200
Subject: block: allow large discard requests

Currently we set the bio size to the byte equivalent of the blocks to
be trimmed when submitting the initial DISCARD ioctl.  That means it
is subject to the max_hw_sectors limitation of the HBA which is
much lower than the size of a DISCARD request we can support.
Add a separate max_discard_sectors tunable to limit the size for discard
requests.

We limit the max discard request size in bytes to 32bit as that is the
limit for bio->bi_size.  This could be much larger if we had a way to pass
that information through the block layer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 10 ++++++----
 block/blk-core.c       |  3 ++-
 block/blk-settings.c   | 13 +++++++++++++
 include/linux/blkdev.h |  3 +++
 4 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 21f5025c3945..8873b9b439ff 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -385,6 +385,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 
 	while (nr_sects && !ret) {
 		unsigned int sector_size = q->limits.logical_block_size;
+		unsigned int max_discard_sectors =
+			min(q->limits.max_discard_sectors, UINT_MAX >> 9);
 
 		bio = bio_alloc(gfp_mask, 1);
 		if (!bio)
@@ -411,10 +413,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		 * touch many more blocks on disk than the actual payload
 		 * length.
 		 */
-		if (nr_sects > queue_max_hw_sectors(q)) {
-			bio->bi_size = queue_max_hw_sectors(q) << 9;
-			nr_sects -= queue_max_hw_sectors(q);
-			sector += queue_max_hw_sectors(q);
+		if (nr_sects > max_discard_sectors) {
+			bio->bi_size = max_discard_sectors << 9;
+			nr_sects -= max_discard_sectors;
+			sector += max_discard_sectors;
 		} else {
 			bio->bi_size = nr_sects << 9;
 			nr_sects = 0;
diff --git a/block/blk-core.c b/block/blk-core.c
index 80a020dd1580..34504f309728 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1436,7 +1436,8 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
-		if (unlikely(nr_sectors > queue_max_hw_sectors(q))) {
+		if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
+			     nr_sectors > queue_max_hw_sectors(q))) {
 			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
 			       bdevname(bio->bi_bdev, b),
 			       bio_sectors(bio),
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d29498ef1eb5..e0695bca7027 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_segment_size = MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
 	lim->max_hw_sectors = INT_MAX;
+	lim->max_discard_sectors = SAFE_MAX_SECTORS;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -238,6 +239,18 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
+/**
+ * blk_queue_max_discard_sectors - set max sectors for a single discard
+ * @q:  the request queue for the device
+ * @max_discard: maximum number of sectors to discard
+ **/
+void blk_queue_max_discard_sectors(struct request_queue *q,
+		unsigned int max_discard_sectors)
+{
+	q->limits.max_discard_sectors = max_discard_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_discard_sectors);
+
 /**
  * blk_queue_max_phys_segments - set max phys segments for a request for this queue
  * @q:  the request queue for the device
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f62d45e87618..1a03b715dfad 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -311,6 +311,7 @@ struct queue_limits {
 	unsigned int		alignment_offset;
 	unsigned int		io_min;
 	unsigned int		io_opt;
+	unsigned int		max_discard_sectors;
 
 	unsigned short		logical_block_size;
 	unsigned short		max_hw_segments;
@@ -928,6 +929,8 @@ extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
+extern void blk_queue_max_discard_sectors(struct request_queue *q,
+		unsigned int max_discard_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_alignment_offset(struct request_queue *q,
-- 
cgit v1.2.3


From 1a35e0f6443f4266dad4c569c55c57a9032596fa Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Thu, 1 Oct 2009 21:16:13 +0200
Subject: Add a tracepoint for block request remapping

Since 2.6.31 now has request-based device-mapper, it's useful to have
a tracepoint for request-remapping as well as bio-remapping.
This patch adds a tracepoint for request-remapping, trace_block_rq_remap().

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c             |  1 +
 include/linux/blktrace_api.h |  2 +-
 include/trace/events/block.h | 33 +++++++++++++++++++++++++++++++++
 kernel/trace/blktrace.c      | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 34504f309728..ddaaea4fdffc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include "blk.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
 static int __make_request(struct request_queue *q, struct bio *bio);
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 622939a23299..3b73b9992b26 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -212,7 +212,7 @@ extern struct attribute_group blk_trace_attr_group;
 # define blk_trace_startstop(q, start)			(-ENOTTY)
 # define blk_trace_remove(q)				(-ENOTTY)
 # define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
-# define blk_trace_remove_sysfs(struct device *dev)	do { } while (0)
+# define blk_trace_remove_sysfs(dev)			do { } while (0)
 static inline int blk_trace_init_sysfs(struct device *dev)
 {
 	return 0;
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index d86af94691c2..00405b5f624a 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -488,6 +488,39 @@ TRACE_EVENT(block_remap,
 		  (unsigned long long)__entry->old_sector)
 );
 
+TRACE_EVENT(block_rq_remap,
+
+	TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev,
+		 sector_t from),
+
+	TP_ARGS(q, rq, dev, from),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned int,	nr_sector	)
+		__field( dev_t,		old_dev		)
+		__field( sector_t,	old_sector	)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= disk_devt(rq->rq_disk);
+		__entry->sector		= blk_rq_pos(rq);
+		__entry->nr_sector	= blk_rq_sectors(rq);
+		__entry->old_dev	= dev;
+		__entry->old_sector	= from;
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+	),
+
+	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector,
+		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
+		  (unsigned long long)__entry->old_sector)
+);
+
 #endif /* _TRACE_BLOCK_H */
 
 /* This part must be outside protection */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 60b5c5a3d4b4..d9d6206e0b14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -855,6 +855,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 			sizeof(r), &r);
 }
 
+/**
+ * blk_add_trace_rq_remap - Add a trace for a request-remap operation
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @dev:	target device
+ * @from:	source sector
+ *
+ * Description:
+ *     Device mapper remaps request to other devices.
+ *     Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_rq_remap(struct request_queue *q,
+				   struct request *rq, dev_t dev,
+				   sector_t from)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device_from = cpu_to_be32(dev);
+	r.device_to   = cpu_to_be32(disk_devt(rq->rq_disk));
+	r.sector_from = cpu_to_be64(from);
+
+	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+			rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+			sizeof(r), &r);
+}
+
 /**
  * blk_add_driver_data - Add binary message with driver-specific data
  * @q:		queue the io is for
@@ -922,10 +953,13 @@ static void blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_remap(blk_add_trace_remap);
 	WARN_ON(ret);
+	ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
+	WARN_ON(ret);
 }
 
 static void blk_unregister_tracepoints(void)
 {
+	unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
 	unregister_trace_block_remap(blk_add_trace_remap);
 	unregister_trace_block_split(blk_add_trace_split);
 	unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
-- 
cgit v1.2.3


From b411b3637fa71fce9cf2acf0639009500f5892fe Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Fri, 25 Sep 2009 16:07:19 -0700
Subject: The DRBD driver

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 .../blockdev/drbd/DRBD-8.3-data-packets.svg        |  588 +++
 Documentation/blockdev/drbd/DRBD-data-packets.svg  |  459 ++
 Documentation/blockdev/drbd/README.txt             |   16 +
 Documentation/blockdev/drbd/conn-states-8.dot      |   18 +
 Documentation/blockdev/drbd/disk-states-8.dot      |   16 +
 .../drbd/drbd-connection-state-overview.dot        |   85 +
 Documentation/blockdev/drbd/node-states-8.dot      |   14 +
 MAINTAINERS                                        |   13 +
 drivers/block/Kconfig                              |    2 +
 drivers/block/Makefile                             |    1 +
 drivers/block/drbd/Kconfig                         |   82 +
 drivers/block/drbd/Makefile                        |    8 +
 drivers/block/drbd/drbd_actlog.c                   | 1484 +++++++
 drivers/block/drbd/drbd_bitmap.c                   | 1327 ++++++
 drivers/block/drbd/drbd_int.h                      | 2258 ++++++++++
 drivers/block/drbd/drbd_main.c                     | 3735 ++++++++++++++++
 drivers/block/drbd/drbd_nl.c                       | 2365 +++++++++++
 drivers/block/drbd/drbd_proc.c                     |  266 ++
 drivers/block/drbd/drbd_receiver.c                 | 4456 ++++++++++++++++++++
 drivers/block/drbd/drbd_req.c                      | 1132 +++++
 drivers/block/drbd/drbd_req.h                      |  327 ++
 drivers/block/drbd/drbd_strings.c                  |  113 +
 drivers/block/drbd/drbd_tracing.c                  |  752 ++++
 drivers/block/drbd/drbd_tracing.h                  |   87 +
 drivers/block/drbd/drbd_vli.h                      |  351 ++
 drivers/block/drbd/drbd_worker.c                   | 1529 +++++++
 drivers/block/drbd/drbd_wrappers.h                 |   91 +
 include/linux/drbd.h                               |  349 ++
 include/linux/drbd_limits.h                        |  137 +
 include/linux/drbd_nl.h                            |  137 +
 include/linux/drbd_tag_magic.h                     |   83 +
 include/linux/lru_cache.h                          |  294 ++
 lib/Kconfig                                        |    3 +
 lib/Makefile                                       |    2 +
 lib/lru_cache.c                                    |  560 +++
 35 files changed, 23140 insertions(+)
 create mode 100644 Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
 create mode 100644 Documentation/blockdev/drbd/DRBD-data-packets.svg
 create mode 100644 Documentation/blockdev/drbd/README.txt
 create mode 100644 Documentation/blockdev/drbd/conn-states-8.dot
 create mode 100644 Documentation/blockdev/drbd/disk-states-8.dot
 create mode 100644 Documentation/blockdev/drbd/drbd-connection-state-overview.dot
 create mode 100644 Documentation/blockdev/drbd/node-states-8.dot
 create mode 100644 drivers/block/drbd/Kconfig
 create mode 100644 drivers/block/drbd/Makefile
 create mode 100644 drivers/block/drbd/drbd_actlog.c
 create mode 100644 drivers/block/drbd/drbd_bitmap.c
 create mode 100644 drivers/block/drbd/drbd_int.h
 create mode 100644 drivers/block/drbd/drbd_main.c
 create mode 100644 drivers/block/drbd/drbd_nl.c
 create mode 100644 drivers/block/drbd/drbd_proc.c
 create mode 100644 drivers/block/drbd/drbd_receiver.c
 create mode 100644 drivers/block/drbd/drbd_req.c
 create mode 100644 drivers/block/drbd/drbd_req.h
 create mode 100644 drivers/block/drbd/drbd_strings.c
 create mode 100644 drivers/block/drbd/drbd_tracing.c
 create mode 100644 drivers/block/drbd/drbd_tracing.h
 create mode 100644 drivers/block/drbd/drbd_vli.h
 create mode 100644 drivers/block/drbd/drbd_worker.c
 create mode 100644 drivers/block/drbd/drbd_wrappers.h
 create mode 100644 include/linux/drbd.h
 create mode 100644 include/linux/drbd_limits.h
 create mode 100644 include/linux/drbd_nl.h
 create mode 100644 include/linux/drbd_tag_magic.h
 create mode 100644 include/linux/lru_cache.h
 create mode 100644 lib/lru_cache.c

(limited to 'include/linux')

diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
new file mode 100644
index 000000000000..f87cfa0dc2fb
--- /dev/null
+++ b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
@@ -0,0 +1,588 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   version="1.0"
+   width="210mm"
+   height="297mm"
+   viewBox="0 0 21000 29700"
+   id="svg2"
+   style="fill-rule:evenodd">
+  <defs
+     id="defs4" />
+  <g
+     id="Default"
+     style="visibility:visible">
+    <desc
+       id="desc180">Master slide</desc>
+  </g>
+  <path
+     d="M 11999,8601 L 11899,8301 L 12099,8301 L 11999,8601 z"
+     id="path193"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,7801 L 11999,8361"
+     id="path197"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 7999,10401 L 7899,10101 L 8099,10101 L 7999,10401 z"
+     id="path209"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,9601 L 7999,10161"
+     id="path213"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 11999,7801 L 11685,7840 L 11724,7644 L 11999,7801 z"
+     id="path225"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,7001 L 11764,7754"
+     id="path229"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-1244.4792,1416.5139)"
+     id="g245"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text247">
+      <tspan
+         x="9139 9368 9579 9808 9986 10075 10252 10481 10659 10837 10909"
+         y="9284"
+         id="tspan249">RSDataReply</tspan>
+    </text>
+  </g>
+  <path
+     d="M 7999,9601 L 8281,9458 L 8311,9655 L 7999,9601 z"
+     id="path259"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,9001 L 8236,9565"
+     id="path263"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,1620.9382,-1639.4947)"
+     id="g279"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text281">
+      <tspan
+         x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
+         y="7023"
+         id="tspan283">CsumRSRequest</tspan>
+    </text>
+  </g>
+  <text
+     id="text297"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
+       y="5707"
+       id="tspan299">w_make_resync_request()</tspan>
+  </text>
+  <text
+     id="text313"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
+       y="7806"
+       id="tspan315">receive_DataRequest()</tspan>
+  </text>
+  <text
+     id="text329"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
+       y="8606"
+       id="tspan331">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text345"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
+       y="9007"
+       id="tspan347">w_e_end_csum_rs_req()</tspan>
+  </text>
+  <text
+     id="text361"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
+       y="9507"
+       id="tspan363">receive_RSDataReply()</tspan>
+  </text>
+  <text
+     id="text377"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
+       y="10407"
+       id="tspan379">drbd_endio_write_sec()</tspan>
+  </text>
+  <text
+     id="text393"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
+       y="10907"
+       id="tspan395">e_end_resync_block()</tspan>
+  </text>
+  <path
+     d="M 11999,11601 L 11685,11640 L 11724,11444 L 11999,11601 z"
+     id="path405"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 7999,10801 L 11764,11554"
+     id="path409"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,2434.7562,-1674.649)"
+     id="g425"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text427">
+      <tspan
+         x="9320 9621 9726 9798 9887 10065 10277 10438"
+         y="10943"
+         id="tspan429">WriteAck</tspan>
+    </text>
+  </g>
+  <text
+     id="text443"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
+       y="11559"
+       id="tspan445">got_BlockAck()</tspan>
+  </text>
+  <text
+     id="text459"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14302 14540 14658 14777 14870 15107 15225 15437 15649 15886"
+       y="4877"
+       id="tspan461">Checksum based Resync, case not in sync</tspan>
+  </text>
+  <text
+     id="text475"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="6961 7266 7571 7854 8159 8299 8536 8654 8891 9010 9247 9484 9603 9840 9958 10077 10170 10407"
+       y="2806"
+       id="tspan477">DRBD-8.3 data flow</tspan>
+  </text>
+  <text
+     id="text491"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
+       y="7005"
+       id="tspan493">w_e_send_csum()</tspan>
+  </text>
+  <path
+     d="M 11999,17601 L 11899,17301 L 12099,17301 L 11999,17601 z"
+     id="path503"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,16801 L 11999,17361"
+     id="path507"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 11999,16801 L 11685,16840 L 11724,16644 L 11999,16801 z"
+     id="path519"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,16001 L 11764,16754"
+     id="path523"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-2539.5806,1529.3491)"
+     id="g539"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text541">
+      <tspan
+         x="9269 9498 9709 9798 9959 10048 10226 10437 10598 10776"
+         y="18265"
+         id="tspan543">RSIsInSync</tspan>
+    </text>
+  </g>
+  <path
+     d="M 7999,18601 L 8281,18458 L 8311,18655 L 7999,18601 z"
+     id="path553"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 11999,18001 L 8236,18565"
+     id="path557"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,3461.4027,-1449.3012)"
+     id="g573"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text575">
+      <tspan
+         x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
+         y="16023"
+         id="tspan577">CsumRSRequest</tspan>
+    </text>
+  </g>
+  <text
+     id="text591"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
+       y="16806"
+       id="tspan593">receive_DataRequest()</tspan>
+  </text>
+  <text
+     id="text607"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
+       y="17606"
+       id="tspan609">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text623"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
+       y="18007"
+       id="tspan625">w_e_end_csum_rs_req()</tspan>
+  </text>
+  <text
+     id="text639"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5735 5913 6091 6180 6357 6446 6607 6696 6874 7085 7246 7424 7585 7691"
+       y="18507"
+       id="tspan641">got_IsInSync()</tspan>
+  </text>
+  <text
+     id="text655"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14159 14396 14514 14726 14937 15175"
+       y="13877"
+       id="tspan657">Checksum based Resync, case in sync</tspan>
+  </text>
+  <path
+     d="M 12000,24601 L 11900,24301 L 12100,24301 L 12000,24601 z"
+     id="path667"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,23801 L 12000,24361"
+     id="path671"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 8000,26401 L 7900,26101 L 8100,26101 L 8000,26401 z"
+     id="path683"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,25601 L 8000,26161"
+     id="path687"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 12000,23801 L 11686,23840 L 11725,23644 L 12000,23801 z"
+     id="path699"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,23001 L 11765,23754"
+     id="path703"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-3543.8452,1630.5143)"
+     id="g719"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text721">
+      <tspan
+         x="9464 9710 9921 10150 10328 10505 10577"
+         y="25236"
+         id="tspan723">OVReply</tspan>
+    </text>
+  </g>
+  <path
+     d="M 8000,25601 L 8282,25458 L 8312,25655 L 8000,25601 z"
+     id="path733"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,25001 L 8237,25565"
+     id="path737"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,4918.2801,-1381.2128)"
+     id="g753"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text755">
+      <tspan
+         x="9142 9388 9599 9828 10006 10183 10361 10539 10700"
+         y="23106"
+         id="tspan757">OVRequest</tspan>
+    </text>
+  </g>
+  <text
+     id="text771"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13656 13868 14097 14274 14452 14630 14808 14969 15058 15163"
+       y="23806"
+       id="tspan773">receive_OVRequest()</tspan>
+  </text>
+  <text
+     id="text787"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
+       y="24606"
+       id="tspan789">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text803"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14004 14182 14288 14465 14643 14749"
+       y="25007"
+       id="tspan805">w_e_end_ov_req()</tspan>
+  </text>
+  <text
+     id="text819"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5101 5207 5385 5546 5723 5795 5956 6134 6312 6557 6769 6998 7175 7353 7425 7586 7692"
+       y="25507"
+       id="tspan821">receive_OVReply()</tspan>
+  </text>
+  <text
+     id="text835"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
+       y="26407"
+       id="tspan837">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text851"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4902 5131 5308 5486 5664 5842 6020 6197 6375 6553 6714 6892 6998 7175 7353 7425 7586 7692"
+       y="26907"
+       id="tspan853">w_e_end_ov_reply()</tspan>
+  </text>
+  <path
+     d="M 12000,27601 L 11686,27640 L 11725,27444 L 12000,27601 z"
+     id="path863"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 8000,26801 L 11765,27554"
+     id="path867"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,5704.1907,-1328.312)"
+     id="g883"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text885">
+      <tspan
+         x="9279 9525 9736 9965 10143 10303 10481 10553"
+         y="26935"
+         id="tspan887">OVResult</tspan>
+    </text>
+  </g>
+  <text
+     id="text901"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12378 12556 12645 12822 13068 13280 13508 13686 13847 14025 14097 14185 14291"
+       y="27559"
+       id="tspan903">got_OVResult()</tspan>
+  </text>
+  <text
+     id="text917"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="8000 8330 8567 8660 8754 8991 9228 9346 9558 9795 9935 10028 10146"
+       y="21877"
+       id="tspan919">Online verify</tspan>
+  </text>
+  <text
+     id="text933"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4641 4870 5047 5310 5488 5649 5826 6004 6182 6343 6521 6626 6804 6982 7160 7338 7499 7587 7693"
+       y="23005"
+       id="tspan935">w_make_ov_request()</tspan>
+  </text>
+  <path
+     d="M 8000,6500 L 7900,6200 L 8100,6200 L 8000,6500 z"
+     id="path945"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,5700 L 8000,6260"
+     id="path949"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 3900,5500 L 3700,5500 L 3700,11000 L 3900,11000"
+     id="path961"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 3900,14500 L 3700,14500 L 3700,18600 L 3900,18600"
+     id="path973"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 3900,22800 L 3700,22800 L 3700,26900 L 3900,26900"
+     id="path985"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <text
+     id="text1001"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
+       y="6506"
+       id="tspan1003">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text1017"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
+       y="14708"
+       id="tspan1019">w_make_resync_request()</tspan>
+  </text>
+  <text
+     id="text1033"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
+       y="16006"
+       id="tspan1035">w_e_send_csum()</tspan>
+  </text>
+  <path
+     d="M 8000,15501 L 7900,15201 L 8100,15201 L 8000,15501 z"
+     id="path1045"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,14701 L 8000,15261"
+     id="path1049"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     id="text1065"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
+       y="15507"
+       id="tspan1067">drbd_endio_read_sec()</tspan>
+  </text>
+  <path
+     d="M 16100,9000 L 16300,9000 L 16300,7500 L 16100,7500"
+     id="path1077"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 16100,18000 L 16300,18000 L 16300,16500 L 16100,16500"
+     id="path1089"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 16100,25000 L 16300,25000 L 16300,23500 L 16100,23500"
+     id="path1101"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <text
+     id="text1117"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
+       y="5402"
+       id="tspan1119">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1133"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2027 2133 2294 2472 2649 2827 3005 3077 3255 3432 3504 3682 3788"
+       y="14402"
+       id="tspan1135">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1149"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
+       y="22602"
+       id="tspan1151">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1165"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1426 1532 1693 1871 2031 2209 2472 2649 2721 2899 2988 3166 3344 3416 3593 3699"
+       y="11302"
+       id="tspan1167">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1181"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
+       y="18931"
+       id="tspan1183">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1197"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
+       y="27231"
+       id="tspan1199">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1213"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16126 16232 16393 16571 16748 16926 17104 17176 17354 17531 17603 17781 17887"
+       y="7402"
+       id="tspan1215">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1229"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
+       y="16331"
+       id="tspan1231">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1245"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
+       y="23302"
+       id="tspan1247">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1261"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
+       y="9302"
+       id="tspan1263">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1277"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
+       y="18331"
+       id="tspan1279">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1293"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16126 16232 16393 16571 16731 16909 17172 17349 17421 17599 17688 17866 18044 18116 18293 18399"
+       y="25302"
+       id="tspan1295">rs_complete_io()</tspan>
+  </text>
+</svg>
diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/blockdev/drbd/DRBD-data-packets.svg
new file mode 100644
index 000000000000..48a1e2165fec
--- /dev/null
+++ b/Documentation/blockdev/drbd/DRBD-data-packets.svg
@@ -0,0 +1,459 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   version="1.0"
+   width="210mm"
+   height="297mm"
+   viewBox="0 0 21000 29700"
+   id="svg2"
+   style="fill-rule:evenodd">
+  <defs
+     id="defs4" />
+  <g
+     id="Default"
+     style="visibility:visible">
+    <desc
+       id="desc176">Master slide</desc>
+  </g>
+  <path
+     d="M 11999,19601 L 11899,19301 L 12099,19301 L 11999,19601 z"
+     id="path189"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,18801 L 11999,19361"
+     id="path193"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 7999,21401 L 7899,21101 L 8099,21101 L 7999,21401 z"
+     id="path205"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,20601 L 7999,21161"
+     id="path209"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 11999,18801 L 11685,18840 L 11724,18644 L 11999,18801 z"
+     id="path221"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,18001 L 11764,18754"
+     id="path225"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="-3023.845"
+     y="1106.8124"
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
+     id="text243"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="6115.1553 6344.1553 6555.1553 6784.1553 6962.1553 7051.1553 7228.1553 7457.1553 7635.1553 7813.1553 7885.1553"
+       y="21390.812"
+       id="tspan245">RSDataReply</tspan>
+  </text>
+  <path
+     d="M 7999,20601 L 8281,20458 L 8311,20655 L 7999,20601 z"
+     id="path255"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,20001 L 8236,20565"
+     id="path259"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="3502.5356"
+     y="-2184.6621"
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
+     id="text277"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12321.536 12550.536 12761.536 12990.536 13168.536 13257.536 13434.536 13663.536 13841.536 14019.536 14196.536 14374.536 14535.536"
+       y="15854.338"
+       id="tspan279">RSDataRequest</tspan>
+  </text>
+  <text
+     id="text293"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
+       y="17807"
+       id="tspan295">w_make_resync_request()</tspan>
+  </text>
+  <text
+     id="text309"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
+       y="18806"
+       id="tspan311">receive_DataRequest()</tspan>
+  </text>
+  <text
+     id="text325"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
+       y="19606"
+       id="tspan327">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text341"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13770 13931 14109 14287 14375 14553 14731 14837 15015 15192 15298"
+       y="20007"
+       id="tspan343">w_e_end_rsdata_req()</tspan>
+  </text>
+  <text
+     id="text357"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
+       y="20507"
+       id="tspan359">receive_RSDataReply()</tspan>
+  </text>
+  <text
+     id="text373"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
+       y="21407"
+       id="tspan375">drbd_endio_write_sec()</tspan>
+  </text>
+  <text
+     id="text389"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
+       y="21907"
+       id="tspan391">e_end_resync_block()</tspan>
+  </text>
+  <path
+     d="M 11999,22601 L 11685,22640 L 11724,22444 L 11999,22601 z"
+     id="path401"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 7999,21801 L 11764,22554"
+     id="path405"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <text
+     x="4290.3008"
+     y="-2369.6162"
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
+     id="text423"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="13610.301 13911.301 14016.301 14088.301 14177.301 14355.301 14567.301 14728.301"
+       y="19573.385"
+       id="tspan425">WriteAck</tspan>
+  </text>
+  <text
+     id="text439"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
+       y="22559"
+       id="tspan441">got_BlockAck()</tspan>
+  </text>
+  <text
+     id="text455"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="7999 8304 8541 8753 8964 9201 9413 9531 9769 9862 10099 10310 10522 10734 10852 10971 11208 11348 11585 11822"
+       y="16877"
+       id="tspan457">Resync blocks, 4-32K</tspan>
+  </text>
+  <path
+     d="M 12000,7601 L 11900,7301 L 12100,7301 L 12000,7601 z"
+     id="path467"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,6801 L 12000,7361"
+     id="path471"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 12000,6801 L 11686,6840 L 11725,6644 L 12000,6801 z"
+     id="path483"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,6001 L 11765,6754"
+     id="path487"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="-1288.1796"
+     y="1279.7666"
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
+     id="text505"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="8174.8208 8475.8203 8580.8203 8652.8203 8741.8203 8919.8203 9131.8203 9292.8203"
+       y="9516.7666"
+       id="tspan507">WriteAck</tspan>
+  </text>
+  <path
+     d="M 8000,8601 L 8282,8458 L 8312,8655 L 8000,8601 z"
+     id="path517"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 12000,8001 L 8237,8565"
+     id="path521"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <text
+     x="1065.6655"
+     y="-2097.7664"
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
+     id="text539"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="10682.666 10911.666 11088.666 11177.666"
+       y="4107.2339"
+       id="tspan541">Data</tspan>
+  </text>
+  <text
+     id="text555"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
+       y="5505"
+       id="tspan557">drbd_make_request()</tspan>
+  </text>
+  <text
+     id="text571"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14190"
+       y="6806"
+       id="tspan573">receive_Data()</tspan>
+  </text>
+  <text
+     id="text587"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14207 14312 14384 14473 14651 14829 14990 15168 15328 15434"
+       y="7606"
+       id="tspan589">drbd_endio_write_sec()</tspan>
+  </text>
+  <text
+     id="text603"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12192 12370 12548 12725 12903 13081 13259 13437 13509 13686 13847 14008 14114"
+       y="8007"
+       id="tspan605">e_end_block()</tspan>
+  </text>
+  <text
+     id="text619"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5647 5825 6003 6092 6269 6481 6553 6731 6892 7052 7264 7425 7586 7692"
+       y="8606"
+       id="tspan621">got_BlockAck()</tspan>
+  </text>
+  <text
+     id="text635"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="8000 8305 8542 8779 9016 9109 9346 9486 9604 9956 10049 10189 10328 10565 10705 10942 11179 11298 11603 11742 11835 11954 12191 12310 12428 12665 12902 13139 13279 13516 13753"
+       y="4877"
+       id="tspan637">Regular mirrored write, 512-32K</tspan>
+  </text>
+  <text
+     id="text651"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5381 5610 5787 5948 6126 6304 6482 6659 6837 7015 7087 7265 7426 7587 7692"
+       y="6003"
+       id="tspan653">w_send_dblock()</tspan>
+  </text>
+  <path
+     d="M 8000,6800 L 7900,6500 L 8100,6500 L 8000,6800 z"
+     id="path663"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,6000 L 8000,6560"
+     id="path667"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     id="text683"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4602 4780 4886 5063 5241 5419 5597 5775 5952 6024 6202 6380 6609 6714 6786 6875 7053 7231 7409 7515 7587 7692"
+       y="6905"
+       id="tspan685">drbd_endio_write_pri()</tspan>
+  </text>
+  <path
+     d="M 12000,13602 L 11900,13302 L 12100,13302 L 12000,13602 z"
+     id="path695"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,12802 L 12000,13362"
+     id="path699"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 12000,12802 L 11686,12841 L 11725,12645 L 12000,12802 z"
+     id="path711"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,12002 L 11765,12755"
+     id="path715"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="-2155.5266"
+     y="1201.5964"
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
+     id="text733"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="7202.4736 7431.4736 7608.4736 7697.4736 7875.4736 8104.4736 8282.4736 8459.4736 8531.4736"
+       y="15454.597"
+       id="tspan735">DataReply</tspan>
+  </text>
+  <path
+     d="M 8000,14602 L 8282,14459 L 8312,14656 L 8000,14602 z"
+     id="path745"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,14002 L 8237,14566"
+     id="path749"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="2280.3804"
+     y="-2103.2141"
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
+     id="text767"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="11316.381 11545.381 11722.381 11811.381 11989.381 12218.381 12396.381 12573.381 12751.381 12929.381 13090.381"
+       y="9981.7861"
+       id="tspan769">DataRequest</tspan>
+  </text>
+  <text
+     id="text783"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
+       y="11506"
+       id="tspan785">drbd_make_request()</tspan>
+  </text>
+  <text
+     id="text799"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14312 14490 14668 14846 15024 15185 15273 15379"
+       y="12807"
+       id="tspan801">receive_DataRequest()</tspan>
+  </text>
+  <text
+     id="text815"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
+       y="13607"
+       id="tspan817">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text831"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14021 14110 14288 14465 14571 14749 14927 15033"
+       y="14008"
+       id="tspan833">w_e_end_data_req()</tspan>
+  </text>
+  <g
+     id="g835"
+     style="visibility:visible">
+    <desc
+       id="desc837">Drawing</desc>
+    <text
+       id="text847"
+       style="font-size:318px;font-weight:400;fill:#008000;font-family:Helvetica embedded">
+      <tspan
+         x="4885 4991 5169 5330 5507 5579 5740 5918 6096 6324 6502 6591 6769 6997 7175 7353 7425 7586 7692"
+         y="14607"
+         id="tspan849">receive_DataReply()</tspan>
+    </text>
+  </g>
+  <text
+     id="text863"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="8000 8305 8398 8610 8821 8914 9151 9363 9575 9693 9833 10070 10307 10544 10663 10781 11018 11255 11493 11632 11869 12106"
+       y="10878"
+       id="tspan865">Diskless read, 512-32K</tspan>
+  </text>
+  <text
+     id="text879"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5029 5258 5435 5596 5774 5952 6130 6307 6413 6591 6769 6947 7125 7230 7408 7586 7692"
+       y="12004"
+       id="tspan881">w_send_read_req()</tspan>
+  </text>
+  <text
+     id="text895"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="6961 7266 7571 7854 8159 8278 8515 8633 8870 9107 9226 9463 9581 9700 9793 10030"
+       y="2806"
+       id="tspan897">DRBD 8 data flow</tspan>
+  </text>
+  <path
+     d="M 3900,5300 L 3700,5300 L 3700,7000 L 3900,7000"
+     id="path907"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 3900,17600 L 3700,17600 L 3700,22000 L 3900,22000"
+     id="path919"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 16100,20000 L 16300,20000 L 16300,18500 L 16100,18500"
+     id="path931"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <text
+     id="text947"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2126 2304 2376 2554 2731 2909 3087 3159 3337 3515 3587 3764 3870"
+       y="5202"
+       id="tspan949">al_begin_io()</tspan>
+  </text>
+  <text
+     id="text963"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1632 1810 1882 2060 2220 2398 2661 2839 2910 3088 3177 3355 3533 3605 3783 3888"
+       y="7331"
+       id="tspan965">al_complete_io()</tspan>
+  </text>
+  <text
+     id="text979"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2126 2232 2393 2571 2748 2926 3104 3176 3354 3531 3603 3781 3887"
+       y="17431"
+       id="tspan981">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text995"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1626 1732 1893 2071 2231 2409 2672 2849 2921 3099 3188 3366 3544 3616 3793 3899"
+       y="22331"
+       id="tspan997">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1011"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16027 16133 16294 16472 16649 16827 17005 17077 17255 17432 17504 17682 17788"
+       y="18402"
+       id="tspan1013">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1027"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
+       y="20331"
+       id="tspan1029">rs_complete_io()</tspan>
+  </text>
+</svg>
diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt
new file mode 100644
index 000000000000..627b0a1bf35e
--- /dev/null
+++ b/Documentation/blockdev/drbd/README.txt
@@ -0,0 +1,16 @@
+Description
+
+  DRBD is a shared-nothing, synchronously replicated block device. It
+  is designed to serve as a building block for high availability
+  clusters and in this context, is a "drop-in" replacement for shared
+  storage. Simplistically, you could see it as a network RAID 1.
+
+  Please visit http://www.drbd.org to find out more.
+
+The here included files are intended to help understand the implementation
+
+DRBD-8.3-data-packets.svg, DRBD-data-packets.svg  
+  relates some functions, and write packets.
+
+conn-states-8.dot, disk-states-8.dot, node-states-8.dot
+  The sub graphs of DRBD's state transitions
diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/blockdev/drbd/conn-states-8.dot
new file mode 100644
index 000000000000..025e8cf5e64a
--- /dev/null
+++ b/Documentation/blockdev/drbd/conn-states-8.dot
@@ -0,0 +1,18 @@
+digraph conn_states {
+	StandAllone  -> WFConnection   [ label = "ioctl_set_net()" ]
+	WFConnection -> Unconnected    [ label = "unable to bind()" ]
+	WFConnection -> WFReportParams [ label = "in connect() after accept" ]
+	WFReportParams -> StandAllone  [ label = "checks in receive_param()" ]
+	WFReportParams -> Connected    [ label = "in receive_param()" ]
+	WFReportParams -> WFBitMapS    [ label = "sync_handshake()" ]
+	WFReportParams -> WFBitMapT    [ label = "sync_handshake()" ]
+	WFBitMapS -> SyncSource        [ label = "receive_bitmap()" ]
+	WFBitMapT -> SyncTarget        [ label = "receive_bitmap()" ]
+	SyncSource -> Connected
+	SyncTarget -> Connected
+	SyncSource -> PausedSyncS
+	SyncTarget -> PausedSyncT
+	PausedSyncS -> SyncSource
+	PausedSyncT -> SyncTarget
+	Connected   -> WFConnection    [ label = "* on network error" ]
+}
diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/blockdev/drbd/disk-states-8.dot
new file mode 100644
index 000000000000..d06cfb46fb98
--- /dev/null
+++ b/Documentation/blockdev/drbd/disk-states-8.dot
@@ -0,0 +1,16 @@
+digraph disk_states {
+	Diskless -> Inconsistent       [ label = "ioctl_set_disk()" ]
+	Diskless -> Consistent         [ label = "ioctl_set_disk()" ]
+	Diskless -> Outdated           [ label = "ioctl_set_disk()" ]
+	Consistent -> Outdated         [ label = "receive_param()" ]
+	Consistent -> UpToDate         [ label = "receive_param()" ]
+	Consistent -> Inconsistent     [ label = "start resync" ]
+	Outdated   -> Inconsistent     [ label = "start resync" ]
+	UpToDate   -> Inconsistent     [ label = "ioctl_replicate" ]
+	Inconsistent -> UpToDate       [ label = "resync completed" ]
+	Consistent -> Failed           [ label = "io completion error" ]
+	Outdated   -> Failed           [ label = "io completion error" ]
+	UpToDate   -> Failed           [ label = "io completion error" ]
+	Inconsistent -> Failed         [ label = "io completion error" ]
+	Failed -> Diskless             [ label = "sending notify to peer" ]
+}
diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
new file mode 100644
index 000000000000..6d9cf0a7b11d
--- /dev/null
+++ b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
@@ -0,0 +1,85 @@
+// vim: set sw=2 sts=2 :
+digraph {
+  rankdir=BT
+  bgcolor=white
+
+  node [shape=plaintext]
+  node [fontcolor=black]
+
+  StandAlone     [ style=filled,fillcolor=gray,label=StandAlone ]
+
+  node [fontcolor=lightgray]
+
+  Unconnected    [ label=Unconnected ]
+
+  CommTrouble [ shape=record,
+    label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]
+
+  node [fontcolor=gray]
+
+  subgraph cluster_try_connect {
+    label="try to connect, handshake"
+    rank=max
+    WFConnection   [ label=WFConnection ]
+    WFReportParams [ label=WFReportParams ]
+  }
+
+  TearDown       [ label=TearDown ]
+
+  Connected      [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]
+
+  node [fontcolor=lightblue]
+
+  StartingSyncS  [ label=StartingSyncS ]
+  StartingSyncT  [ label=StartingSyncT ]
+
+  subgraph cluster_bitmap_exchange {
+    node [fontcolor=red]
+    fontcolor=red
+    label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"
+
+    WFBitMapT      [ label=WFBitMapT ]
+    WFSyncUUID     [ label=WFSyncUUID ]
+    WFBitMapS      [ label=WFBitMapS ]
+  }
+
+  node [fontcolor=blue]
+
+  cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]
+
+  node [shape=box,fontcolor=black]
+
+  // drbdadm [label="drbdadm connect"]
+  // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
+  // comm_error [label="communication trouble"]
+
+  //
+  // edges
+  // --------------------------------------
+
+  StandAlone -> Unconnected [ label="drbdadm connect" ]
+  Unconnected -> StandAlone  [ label="drbdadm disconnect\lor serious communication trouble" ]
+  Unconnected -> WFConnection [ label="receiver thread is started" ]
+  WFConnection -> WFReportParams [ headlabel="accept()\land/or                        \lconnect()\l" ]
+
+  WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
+  WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]
+
+    WFReportParams -> WFBitMapS
+    WFReportParams -> WFBitMapT
+    WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]
+
+      WFBitMapS -> cluster_resync:S
+      WFSyncUUID -> cluster_resync:T
+
+  edge [color=green]
+  cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ]
+
+  edge [color=red]
+  WFReportParams -> CommTrouble
+  Connected -> CommTrouble
+  cluster_resync:any -> CommTrouble
+  edge [color=black]
+  CommTrouble -> Unconnected [label="receiver thread is stopped" ]
+
+}
diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot
new file mode 100644
index 000000000000..4a2b00c23547
--- /dev/null
+++ b/Documentation/blockdev/drbd/node-states-8.dot
@@ -0,0 +1,14 @@
+digraph node_states {
+	Secondary -> Primary           [ label = "ioctl_set_state()" ]
+	Primary   -> Secondary 	       [ label = "ioctl_set_state()" ]
+}
+
+digraph peer_states {
+	Secondary -> Primary           [ label = "recv state packet" ]
+	Primary   -> Secondary 	       [ label = "recv state packet" ]
+	Primary   -> Unknown 	       [ label = "connection lost" ]
+	Secondary  -> Unknown  	       [ label = "connection lost" ]
+	Unknown   -> Primary           [ label = "connected" ]
+	Unknown   -> Secondary         [ label = "connected" ]
+}
+
diff --git a/MAINTAINERS b/MAINTAINERS
index c450f3abb8c9..ea56bd7a6cba 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1758,6 +1758,19 @@ S:	Maintained
 F:	drivers/scsi/dpt*
 F:	drivers/scsi/dpt/
 
+DRBD DRIVER
+P:     Philipp Reisner
+P:     Lars Ellenberg
+M:     drbd-dev@lists.linbit.com
+L:     drbd-user@lists.linbit.com
+W:     http://www.drbd.org
+T:     git git://git.drbd.org/linux-2.6-drbd.git drbd
+T:     git git://git.drbd.org/drbd-8.3.git
+S:     Supported
+F:     drivers/block/drbd/
+F:     lib/lru_cache.c
+F:     Documentation/blockdev/drbd/
+
 DRIVER CORE, KOBJECTS, AND SYSFS
 M:	Greg Kroah-Hartman <gregkh@suse.de>
 T:	quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e079c58..77bfce52e9ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
 	  instead, which can be configured to be on-disk compatible with the
 	  cryptoloop device.
 
+source "drivers/block/drbd/Kconfig"
+
 config BLK_DEV_NBD
 	tristate "Network block device support"
 	depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8fddf0..aff5ac925c34 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB)	+= ub.o
 obj-$(CONFIG_BLK_DEV_HD)	+= hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
 
 swim_mod-objs	:= swim.o swim_asm.o
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000000..4e6f90f487c2
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,82 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
+	depends on !PROC_FS || !INET || !CONNECTOR
+
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed Replicated Block Device support"
+	depends on PROC_FS && INET && CONNECTOR
+	select LRU_CACHE
+	default n
+	help
+
+	  NOTE: In order to authenticate connections you have to select
+	  CRYPTO_HMAC and a hash function as well.
+
+	  DRBD is a shared-nothing, synchronously replicated block device. It
+	  is designed to serve as a building block for high availability
+	  clusters and in this context, is a "drop-in" replacement for shared
+	  storage. Simplistically, you could see it as a network RAID 1.
+
+	  Each minor device has a role, which can be 'primary' or 'secondary'.
+	  On the node with the primary device the application is supposed to
+	  run and to access the device (/dev/drbdX). Every write is sent to
+	  the local 'lower level block device' and, across the network, to the
+	  node with the device in 'secondary' state.  The secondary device
+	  simply writes the data to its lower level block device.
+
+	  DRBD can also be used in dual-Primary mode (device writable on both
+	  nodes), which means it can exhibit shared disk semantics in a
+	  shared-nothing cluster.  Needless to say, on top of dual-Primary
+	  DRBD utilizing a cluster file system is necessary to maintain for
+	  cache coherency.
+
+	  For automatic failover you need a cluster manager (e.g. heartbeat).
+	  See also: http://www.drbd.org/, http://www.linux-ha.org
+
+	  If unsure, say N.
+
+config DRBD_TRACE
+	tristate "DRBD tracing"
+	depends on BLK_DEV_DRBD
+	select TRACEPOINTS
+	default n
+	help
+
+	  Say Y here if you want to be able to trace various events in DRBD.
+
+	  If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+	bool "DRBD fault injection"
+	depends on BLK_DEV_DRBD
+	help
+
+	  Say Y here if you want to simulate IO errors, in order to test DRBD's
+	  behavior.
+
+	  The actual simulation of IO errors is done by writing 3 values to
+	  /sys/module/drbd/parameters/
+
+	  enable_faults: bitmask of...
+	  1	meta data write
+	  2               read
+	  4	resync data write
+	  8	            read
+	  16	data write
+	  32	data read
+	  64	read ahead
+	  128	kmalloc of bitmap
+	  256	allocation of EE (epoch_entries)
+
+	  fault_devs: bitmask of minor numbers
+	  fault_rate: frequency in percent
+
+	  Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+		echo 16 > /sys/module/drbd/parameters/enable_faults
+		echo 1 > /sys/module/drbd/parameters/fault_devs
+		echo 5 > /sys/module/drbd/parameters/fault_rate
+
+	  If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000000..7d86ef8a8b40
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,8 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+
+drbd_trace-y := drbd_tracing.o
+
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
+obj-$(CONFIG_DRBD_TRACE)       += drbd_trace.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000000..74b4835d3107
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1484 @@
+/*
+   drbd_actlog.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_wrappers.h"
+
+/* We maintain a trivial check sum in our on disk activity log.
+ * With that we can ensure correct operation even when the storage
+ * device might do a partial (last) sector write while loosing power.
+ */
+struct __packed al_transaction {
+	u32       magic;
+	u32       tr_number;
+	struct __packed {
+		u32 pos;
+		u32 extent; } updates[1 + AL_EXTENTS_PT];
+	u32       xor_sum;
+};
+
+struct update_odbm_work {
+	struct drbd_work w;
+	unsigned int enr;
+};
+
+struct update_al_work {
+	struct drbd_work w;
+	struct lc_element *al_ext;
+	struct completion event;
+	unsigned int enr;
+	/* if old_enr != LC_FREE, write corresponding bitmap sector, too */
+	unsigned int old_enr;
+};
+
+struct drbd_atodb_wait {
+	atomic_t           count;
+	struct completion  io_done;
+	struct drbd_conf   *mdev;
+	int                error;
+};
+
+
+int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+
+/* The actual tracepoint needs to have constant number of known arguments...
+ */
+void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	trace__drbd_resync(mdev, level, fmt, ap);
+	va_end(ap);
+}
+
+static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
+				 struct drbd_backing_dev *bdev,
+				 struct page *page, sector_t sector,
+				 int rw, int size)
+{
+	struct bio *bio;
+	struct drbd_md_io md_io;
+	int ok;
+
+	md_io.mdev = mdev;
+	init_completion(&md_io.event);
+	md_io.error = 0;
+
+	if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
+		rw |= (1 << BIO_RW_BARRIER);
+	rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
+
+ retry:
+	bio = bio_alloc(GFP_NOIO, 1);
+	bio->bi_bdev = bdev->md_bdev;
+	bio->bi_sector = sector;
+	ok = (bio_add_page(bio, page, size, 0) == size);
+	if (!ok)
+		goto out;
+	bio->bi_private = &md_io;
+	bio->bi_end_io = drbd_md_io_complete;
+	bio->bi_rw = rw;
+
+	trace_drbd_bio(mdev, "Md", bio, 0, NULL);
+
+	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+		bio_endio(bio, -EIO);
+	else
+		submit_bio(rw, bio);
+	wait_for_completion(&md_io.event);
+	ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+
+	/* check for unsupported barrier op.
+	 * would rather check on EOPNOTSUPP, but that is not reliable.
+	 * don't try again for ANY return value != 0 */
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
+		/* Try again with no barrier */
+		dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+		rw &= ~(1 << BIO_RW_BARRIER);
+		bio_put(bio);
+		goto retry;
+	}
+ out:
+	bio_put(bio);
+	return ok;
+}
+
+int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+			 sector_t sector, int rw)
+{
+	int logical_block_size, mask, ok;
+	int offset = 0;
+	struct page *iop = mdev->md_io_page;
+
+	D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+
+	BUG_ON(!bdev->md_bdev);
+
+	logical_block_size = bdev_logical_block_size(bdev->md_bdev);
+	if (logical_block_size == 0)
+		logical_block_size = MD_SECTOR_SIZE;
+
+	/* in case logical_block_size != 512 [ s390 only? ] */
+	if (logical_block_size != MD_SECTOR_SIZE) {
+		mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
+		D_ASSERT(mask == 1 || mask == 3 || mask == 7);
+		D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
+		offset = sector & mask;
+		sector = sector & ~mask;
+		iop = mdev->md_io_tmpp;
+
+		if (rw & WRITE) {
+			/* these are GFP_KERNEL pages, pre-allocated
+			 * on device initialization */
+			void *p = page_address(mdev->md_io_page);
+			void *hp = page_address(mdev->md_io_tmpp);
+
+			ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
+					READ, logical_block_size);
+
+			if (unlikely(!ok)) {
+				dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
+				    "READ [logical_block_size!=512]) failed!\n",
+				    (unsigned long long)sector);
+				return 0;
+			}
+
+			memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
+		}
+	}
+
+	if (sector < drbd_md_first_sector(bdev) ||
+	    sector > drbd_md_last_sector(bdev))
+		dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+		     current->comm, current->pid, __func__,
+		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+
+	ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
+	if (unlikely(!ok)) {
+		dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
+		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+		return 0;
+	}
+
+	if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
+		void *p = page_address(mdev->md_io_page);
+		void *hp = page_address(mdev->md_io_tmpp);
+
+		memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
+	}
+
+	return ok;
+}
+
+static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+{
+	struct lc_element *al_ext;
+	struct lc_element *tmp;
+	unsigned long     al_flags = 0;
+
+	spin_lock_irq(&mdev->al_lock);
+	tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+	if (unlikely(tmp != NULL)) {
+		struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			spin_unlock_irq(&mdev->al_lock);
+			return NULL;
+		}
+	}
+	al_ext   = lc_get(mdev->act_log, enr);
+	al_flags = mdev->act_log->flags;
+	spin_unlock_irq(&mdev->al_lock);
+
+	/*
+	if (!al_ext) {
+		if (al_flags & LC_STARVING)
+			dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
+		if (al_flags & LC_DIRTY)
+			dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
+	}
+	*/
+
+	return al_ext;
+}
+
+void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+	struct lc_element *al_ext;
+	struct update_al_work al_work;
+
+	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+	trace_drbd_actlog(mdev, sector, "al_begin_io");
+
+	wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
+
+	if (al_ext->lc_number != enr) {
+		/* drbd_al_write_transaction(mdev,al_ext,enr);
+		 * recurses into generic_make_request(), which
+		 * disallows recursion, bios being serialized on the
+		 * current->bio_tail list now.
+		 * we have to delegate updates to the activity log
+		 * to the worker thread. */
+		init_completion(&al_work.event);
+		al_work.al_ext = al_ext;
+		al_work.enr = enr;
+		al_work.old_enr = al_ext->lc_number;
+		al_work.w.cb = w_al_write_transaction;
+		drbd_queue_work_front(&mdev->data.work, &al_work.w);
+		wait_for_completion(&al_work.event);
+
+		mdev->al_writ_cnt++;
+
+		spin_lock_irq(&mdev->al_lock);
+		lc_changed(mdev->act_log, al_ext);
+		spin_unlock_irq(&mdev->al_lock);
+		wake_up(&mdev->al_wait);
+	}
+}
+
+void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+	struct lc_element *extent;
+	unsigned long flags;
+
+	trace_drbd_actlog(mdev, sector, "al_complete_io");
+
+	spin_lock_irqsave(&mdev->al_lock, flags);
+
+	extent = lc_find(mdev->act_log, enr);
+
+	if (!extent) {
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+		dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
+		return;
+	}
+
+	if (lc_put(mdev->act_log, extent) == 0)
+		wake_up(&mdev->al_wait);
+
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+int
+w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct update_al_work *aw = container_of(w, struct update_al_work, w);
+	struct lc_element *updated = aw->al_ext;
+	const unsigned int new_enr = aw->enr;
+	const unsigned int evicted = aw->old_enr;
+	struct al_transaction *buffer;
+	sector_t sector;
+	int i, n, mx;
+	unsigned int extent_nr;
+	u32 xor_sum = 0;
+
+	if (!get_ldev(mdev)) {
+		dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+		complete(&((struct update_al_work *)w)->event);
+		return 1;
+	}
+	/* do we have to do a bitmap write, first?
+	 * TODO reduce maximum latency:
+	 * submit both bios, then wait for both,
+	 * instead of doing two synchronous sector writes. */
+	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
+		drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
+
+	mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+	buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+
+	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
+	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
+
+	n = lc_index_of(mdev->act_log, updated);
+
+	buffer->updates[0].pos = cpu_to_be32(n);
+	buffer->updates[0].extent = cpu_to_be32(new_enr);
+
+	xor_sum ^= new_enr;
+
+	mx = min_t(int, AL_EXTENTS_PT,
+		   mdev->act_log->nr_elements - mdev->al_tr_cycle);
+	for (i = 0; i < mx; i++) {
+		unsigned idx = mdev->al_tr_cycle + i;
+		extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
+		buffer->updates[i+1].pos = cpu_to_be32(idx);
+		buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
+		xor_sum ^= extent_nr;
+	}
+	for (; i < AL_EXTENTS_PT; i++) {
+		buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
+		buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
+		xor_sum ^= LC_FREE;
+	}
+	mdev->al_tr_cycle += AL_EXTENTS_PT;
+	if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
+		mdev->al_tr_cycle = 0;
+
+	buffer->xor_sum = cpu_to_be32(xor_sum);
+
+	sector =  mdev->ldev->md.md_offset
+		+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
+
+	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
+		drbd_chk_io_error(mdev, 1, TRUE);
+
+	if (++mdev->al_tr_pos >
+	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+		mdev->al_tr_pos = 0;
+
+	D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
+	mdev->al_tr_number++;
+
+	mutex_unlock(&mdev->md_io_mutex);
+
+	complete(&((struct update_al_work *)w)->event);
+	put_ldev(mdev);
+
+	return 1;
+}
+
+/**
+ * drbd_al_read_tr() - Read a single transaction from the on disk activity log
+ * @mdev:	DRBD device.
+ * @bdev:	Block device to read form.
+ * @b:		pointer to an al_transaction.
+ * @index:	On disk slot of the transaction to read.
+ *
+ * Returns -1 on IO error, 0 on checksum error and 1 upon success.
+ */
+static int drbd_al_read_tr(struct drbd_conf *mdev,
+			   struct drbd_backing_dev *bdev,
+			   struct al_transaction *b,
+			   int index)
+{
+	sector_t sector;
+	int rv, i;
+	u32 xor_sum = 0;
+
+	sector = bdev->md.md_offset + bdev->md.al_offset + index;
+
+	/* Dont process error normally,
+	 * as this is done before disk is attached! */
+	if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
+		return -1;
+
+	rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
+
+	for (i = 0; i < AL_EXTENTS_PT + 1; i++)
+		xor_sum ^= be32_to_cpu(b->updates[i].extent);
+	rv &= (xor_sum == be32_to_cpu(b->xor_sum));
+
+	return rv;
+}
+
+/**
+ * drbd_al_read_log() - Restores the activity log from its on disk representation.
+ * @mdev:	DRBD device.
+ * @bdev:	Block device to read form.
+ *
+ * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
+ */
+int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+	struct al_transaction *buffer;
+	int i;
+	int rv;
+	int mx;
+	int active_extents = 0;
+	int transactions = 0;
+	int found_valid = 0;
+	int from = 0;
+	int to = 0;
+	u32 from_tnr = 0;
+	u32 to_tnr = 0;
+	u32 cnr;
+
+	mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
+
+	/* lock out all other meta data io for now,
+	 * and make sure the page is mapped.
+	 */
+	mutex_lock(&mdev->md_io_mutex);
+	buffer = page_address(mdev->md_io_page);
+
+	/* Find the valid transaction in the log */
+	for (i = 0; i <= mx; i++) {
+		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+		if (rv == 0)
+			continue;
+		if (rv == -1) {
+			mutex_unlock(&mdev->md_io_mutex);
+			return 0;
+		}
+		cnr = be32_to_cpu(buffer->tr_number);
+
+		if (++found_valid == 1) {
+			from = i;
+			to = i;
+			from_tnr = cnr;
+			to_tnr = cnr;
+			continue;
+		}
+		if ((int)cnr - (int)from_tnr < 0) {
+			D_ASSERT(from_tnr - cnr + i - from == mx+1);
+			from = i;
+			from_tnr = cnr;
+		}
+		if ((int)cnr - (int)to_tnr > 0) {
+			D_ASSERT(cnr - to_tnr == i - to);
+			to = i;
+			to_tnr = cnr;
+		}
+	}
+
+	if (!found_valid) {
+		dev_warn(DEV, "No usable activity log found.\n");
+		mutex_unlock(&mdev->md_io_mutex);
+		return 1;
+	}
+
+	/* Read the valid transactions.
+	 * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
+	i = from;
+	while (1) {
+		int j, pos;
+		unsigned int extent_nr;
+		unsigned int trn;
+
+		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+		ERR_IF(rv == 0) goto cancel;
+		if (rv == -1) {
+			mutex_unlock(&mdev->md_io_mutex);
+			return 0;
+		}
+
+		trn = be32_to_cpu(buffer->tr_number);
+
+		spin_lock_irq(&mdev->al_lock);
+
+		/* This loop runs backwards because in the cyclic
+		   elements there might be an old version of the
+		   updated element (in slot 0). So the element in slot 0
+		   can overwrite old versions. */
+		for (j = AL_EXTENTS_PT; j >= 0; j--) {
+			pos = be32_to_cpu(buffer->updates[j].pos);
+			extent_nr = be32_to_cpu(buffer->updates[j].extent);
+
+			if (extent_nr == LC_FREE)
+				continue;
+
+			lc_set(mdev->act_log, extent_nr, pos);
+			active_extents++;
+		}
+		spin_unlock_irq(&mdev->al_lock);
+
+		transactions++;
+
+cancel:
+		if (i == to)
+			break;
+		i++;
+		if (i > mx)
+			i = 0;
+	}
+
+	mdev->al_tr_number = to_tnr+1;
+	mdev->al_tr_pos = to;
+	if (++mdev->al_tr_pos >
+	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+		mdev->al_tr_pos = 0;
+
+	/* ok, we are done with it */
+	mutex_unlock(&mdev->md_io_mutex);
+
+	dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
+	     transactions, active_extents);
+
+	return 1;
+}
+
+static void atodb_endio(struct bio *bio, int error)
+{
+	struct drbd_atodb_wait *wc = bio->bi_private;
+	struct drbd_conf *mdev = wc->mdev;
+	struct page *page;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+	/* strange behavior of some lower level drivers...
+	 * fail the request by clearing the uptodate flag,
+	 * but do not return any error?! */
+	if (!error && !uptodate)
+		error = -EIO;
+
+	drbd_chk_io_error(mdev, error, TRUE);
+	if (error && wc->error == 0)
+		wc->error = error;
+
+	if (atomic_dec_and_test(&wc->count))
+		complete(&wc->io_done);
+
+	page = bio->bi_io_vec[0].bv_page;
+	put_page(page);
+	bio_put(bio);
+	mdev->bm_writ_cnt++;
+	put_ldev(mdev);
+}
+
+#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* activity log to on disk bitmap -- prepare bio unless that sector
+ * is already covered by previously prepared bios */
+static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
+					struct bio **bios,
+					unsigned int enr,
+					struct drbd_atodb_wait *wc) __must_hold(local)
+{
+	struct bio *bio;
+	struct page *page;
+	sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+				      + mdev->ldev->md.bm_offset;
+	unsigned int page_offset = PAGE_SIZE;
+	int offset;
+	int i = 0;
+	int err = -ENOMEM;
+
+	/* Check if that enr is already covered by an already created bio.
+	 * Caution, bios[] is not NULL terminated,
+	 * but only initialized to all NULL.
+	 * For completely scattered activity log,
+	 * the last invocation iterates over all bios,
+	 * and finds the last NULL entry.
+	 */
+	while ((bio = bios[i])) {
+		if (bio->bi_sector == on_disk_sector)
+			return 0;
+		i++;
+	}
+	/* bios[i] == NULL, the next not yet used slot */
+
+	/* GFP_KERNEL, we are not in the write-out path */
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (bio == NULL)
+		return -ENOMEM;
+
+	if (i > 0) {
+		const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
+		page_offset = prev_bv->bv_offset + prev_bv->bv_len;
+		page = prev_bv->bv_page;
+	}
+	if (page_offset == PAGE_SIZE) {
+		page = alloc_page(__GFP_HIGHMEM);
+		if (page == NULL)
+			goto out_bio_put;
+		page_offset = 0;
+	} else {
+		get_page(page);
+	}
+
+	offset = S2W(enr);
+	drbd_bm_get_lel(mdev, offset,
+			min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
+			kmap(page) + page_offset);
+	kunmap(page);
+
+	bio->bi_private = wc;
+	bio->bi_end_io = atodb_endio;
+	bio->bi_bdev = mdev->ldev->md_bdev;
+	bio->bi_sector = on_disk_sector;
+
+	if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE)
+		goto out_put_page;
+
+	atomic_inc(&wc->count);
+	/* we already know that we may do this...
+	 * get_ldev_if_state(mdev,D_ATTACHING);
+	 * just get the extra reference, so that the local_cnt reflects
+	 * the number of pending IO requests DRBD at its backing device.
+	 */
+	atomic_inc(&mdev->local_cnt);
+
+	bios[i] = bio;
+
+	return 0;
+
+out_put_page:
+	err = -EINVAL;
+	put_page(page);
+out_bio_put:
+	bio_put(bio);
+	return err;
+}
+
+/**
+ * drbd_al_to_on_disk_bm() -  * Writes bitmap parts covered by active AL extents
+ * @mdev:	DRBD device.
+ *
+ * Called when we detach (unconfigure) local storage,
+ * or when we go from R_PRIMARY to R_SECONDARY role.
+ */
+void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
+{
+	int i, nr_elements;
+	unsigned int enr;
+	struct bio **bios;
+	struct drbd_atodb_wait wc;
+
+	ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
+		return; /* sorry, I don't have any act_log etc... */
+
+	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+	nr_elements = mdev->act_log->nr_elements;
+
+	/* GFP_KERNEL, we are not in anyone's write-out path */
+	bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
+	if (!bios)
+		goto submit_one_by_one;
+
+	atomic_set(&wc.count, 0);
+	init_completion(&wc.io_done);
+	wc.mdev = mdev;
+	wc.error = 0;
+
+	for (i = 0; i < nr_elements; i++) {
+		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+		if (enr == LC_FREE)
+			continue;
+		/* next statement also does atomic_inc wc.count and local_cnt */
+		if (atodb_prepare_unless_covered(mdev, bios,
+						enr/AL_EXT_PER_BM_SECT,
+						&wc))
+			goto free_bios_submit_one_by_one;
+	}
+
+	/* unnecessary optimization? */
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+
+	/* all prepared, submit them */
+	for (i = 0; i < nr_elements; i++) {
+		if (bios[i] == NULL)
+			break;
+		if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
+			bios[i]->bi_rw = WRITE;
+			bio_endio(bios[i], -EIO);
+		} else {
+			submit_bio(WRITE, bios[i]);
+		}
+	}
+
+	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
+
+	/* always (try to) flush bitmap to stable storage */
+	drbd_md_flush(mdev);
+
+	/* In case we did not submit a single IO do not wait for
+	 * them to complete. ( Because we would wait forever here. )
+	 *
+	 * In case we had IOs and they are already complete, there
+	 * is not point in waiting anyways.
+	 * Therefore this if () ... */
+	if (atomic_read(&wc.count))
+		wait_for_completion(&wc.io_done);
+
+	put_ldev(mdev);
+
+	kfree(bios);
+	return;
+
+ free_bios_submit_one_by_one:
+	/* free everything by calling the endio callback directly. */
+	for (i = 0; i < nr_elements && bios[i]; i++)
+		bio_endio(bios[i], 0);
+
+	kfree(bios);
+
+ submit_one_by_one:
+	dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
+
+	for (i = 0; i < mdev->act_log->nr_elements; i++) {
+		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+		if (enr == LC_FREE)
+			continue;
+		/* Really slow: if we have al-extents 16..19 active,
+		 * sector 4 will be written four times! Synchronous! */
+		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
+	}
+
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+	put_ldev(mdev);
+}
+
+/**
+ * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
+ * @mdev:	DRBD device.
+ */
+void drbd_al_apply_to_bm(struct drbd_conf *mdev)
+{
+	unsigned int enr;
+	unsigned long add = 0;
+	char ppb[10];
+	int i;
+
+	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+	for (i = 0; i < mdev->act_log->nr_elements; i++) {
+		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+		if (enr == LC_FREE)
+			continue;
+		add += drbd_bm_ALe_set_all(mdev, enr);
+	}
+
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+
+	dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
+	     ppsize(ppb, Bit2KB(add)));
+}
+
+static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
+{
+	int rv;
+
+	spin_lock_irq(&mdev->al_lock);
+	rv = (al_ext->refcnt == 0);
+	if (likely(rv))
+		lc_del(mdev->act_log, al_ext);
+	spin_unlock_irq(&mdev->al_lock);
+
+	return rv;
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents form the activity log
+ * @mdev:	DRBD device.
+ *
+ * Removes all active extents form the activity log, waiting until
+ * the reference count of each entry dropped to 0 first, of course.
+ *
+ * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_conf *mdev)
+{
+	struct lc_element *al_ext;
+	int i;
+
+	D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
+
+	for (i = 0; i < mdev->act_log->nr_elements; i++) {
+		al_ext = lc_element_by_index(mdev->act_log, i);
+		if (al_ext->lc_number == LC_FREE)
+			continue;
+		wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
+	}
+
+	wake_up(&mdev->al_wait);
+}
+
+static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+
+	if (!get_ldev(mdev)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
+		kfree(udw);
+		return 1;
+	}
+
+	drbd_bm_write_sect(mdev, udw->enr);
+	put_ldev(mdev);
+
+	kfree(udw);
+
+	if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
+		switch (mdev->state.conn) {
+		case C_SYNC_SOURCE:  case C_SYNC_TARGET:
+		case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
+			drbd_resync_finished(mdev);
+		default:
+			/* nothing to do */
+			break;
+		}
+	}
+	drbd_bcast_sync_progress(mdev);
+
+	return 1;
+}
+
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold an get_ldev() reference.
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
+				      int count, int success)
+{
+	struct lc_element *e;
+	struct update_odbm_work *udw;
+
+	unsigned int enr;
+
+	D_ASSERT(atomic_read(&mdev->local_cnt));
+
+	/* I simply assume that a sector/size pair never crosses
+	 * a 16 MB extent border. (Currently this is true...) */
+	enr = BM_SECT_TO_EXT(sector);
+
+	e = lc_get(mdev->resync, enr);
+	if (e) {
+		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+		if (ext->lce.lc_number == enr) {
+			if (success)
+				ext->rs_left -= count;
+			else
+				ext->rs_failed += count;
+			if (ext->rs_left < ext->rs_failed) {
+				dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
+				    "rs_failed=%d count=%d\n",
+				     (unsigned long long)sector,
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->rs_failed, count);
+				dump_stack();
+
+				lc_put(mdev->resync, &ext->lce);
+				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+				return;
+			}
+		} else {
+			/* Normally this element should be in the cache,
+			 * since drbd_rs_begin_io() pulled it already in.
+			 *
+			 * But maybe an application write finished, and we set
+			 * something outside the resync lru_cache in sync.
+			 */
+			int rs_left = drbd_bm_e_weight(mdev, enr);
+			if (ext->flags != 0) {
+				dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
+				     " -> %d[%u;00]\n",
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->flags, enr, rs_left);
+				ext->flags = 0;
+			}
+			if (ext->rs_failed) {
+				dev_warn(DEV, "Kicking resync_lru element enr=%u "
+				     "out with rs_failed=%d\n",
+				     ext->lce.lc_number, ext->rs_failed);
+				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+			}
+			ext->rs_left = rs_left;
+			ext->rs_failed = success ? 0 : count;
+			lc_changed(mdev->resync, &ext->lce);
+		}
+		lc_put(mdev->resync, &ext->lce);
+		/* no race, we are within the al_lock! */
+
+		if (ext->rs_left == ext->rs_failed) {
+			ext->rs_failed = 0;
+
+			udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
+			if (udw) {
+				udw->enr = ext->lce.lc_number;
+				udw->w.cb = w_update_odbm;
+				drbd_queue_work_front(&mdev->data.work, &udw->w);
+			} else {
+				dev_warn(DEV, "Could not kmalloc an udw\n");
+				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+			}
+		}
+	} else {
+		dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
+		    mdev->resync_locked,
+		    mdev->resync->nr_elements,
+		    mdev->resync->flags);
+	}
+}
+
+/* clear the bit corresponding to the piece of storage in question:
+ * size byte of data starting from sector.  Only clear a bits of the affected
+ * one ore more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
+ *
+ */
+void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
+		       const char *file, const unsigned int line)
+{
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count = 0;
+	sector_t esector, nr_sectors;
+	int wake_up = 0;
+	unsigned long flags;
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+		dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
+				(unsigned long long)sector, size);
+		return;
+	}
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	ERR_IF(sector >= nr_sectors) return;
+	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/* we clear it (in sync).
+	 * round up start sector, round down end sector.  we make sure we only
+	 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
+	if (unlikely(esector < BM_SECT_PER_BIT-1))
+		return;
+	if (unlikely(esector == (nr_sectors-1)))
+		ebnr = lbnr;
+	else
+		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+	trace_drbd_resync(mdev, TRACE_LVL_METRICS,
+			  "drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n",
+			  (unsigned long long)sector, size, sbnr, ebnr);
+
+	if (sbnr > ebnr)
+		return;
+
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	spin_lock_irqsave(&mdev->al_lock, flags);
+	count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
+	if (count) {
+		/* we need the lock for drbd_try_clear_on_disk_bm */
+		if (jiffies - mdev->rs_mark_time > HZ*10) {
+			/* should be rolling marks,
+			 * but we estimate only anyways. */
+			if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
+			    mdev->state.conn != C_PAUSED_SYNC_T &&
+			    mdev->state.conn != C_PAUSED_SYNC_S) {
+				mdev->rs_mark_time = jiffies;
+				mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+			}
+		}
+		if (get_ldev(mdev)) {
+			drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
+			put_ldev(mdev);
+		}
+		/* just wake_up unconditional now, various lc_chaged(),
+		 * lc_put() in drbd_try_clear_on_disk_bm(). */
+		wake_up = 1;
+	}
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+	if (wake_up)
+		wake_up(&mdev->al_wait);
+}
+
+/*
+ * this is intended to set one request worth of data out of sync.
+ * affects at least 1 bit,
+ * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
+ *
+ * called by tl_clear and drbd_send_dblock (==drbd_make_request).
+ * so this can be _any_ process.
+ */
+void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
+			    const char *file, const unsigned int line)
+{
+	unsigned long sbnr, ebnr, lbnr, flags;
+	sector_t esector, nr_sectors;
+	unsigned int enr, count;
+	struct lc_element *e;
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+		dev_err(DEV, "sector: %llus, size: %d\n",
+			(unsigned long long)sector, size);
+		return;
+	}
+
+	if (!get_ldev(mdev))
+		return; /* no disk, no metadata, no bitmap to set bits in */
+
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	ERR_IF(sector >= nr_sectors)
+		goto out;
+	ERR_IF(esector >= nr_sectors)
+		esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/* we set it out of sync,
+	 * we do not need to round anything here */
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	trace_drbd_resync(mdev, TRACE_LVL_METRICS,
+			  "drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n",
+			  (unsigned long long)sector, size, sbnr, ebnr);
+
+	/* ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.  */
+	spin_lock_irqsave(&mdev->al_lock, flags);
+	count = drbd_bm_set_bits(mdev, sbnr, ebnr);
+
+	enr = BM_SECT_TO_EXT(sector);
+	e = lc_find(mdev->resync, enr);
+	if (e)
+		lc_entry(e, struct bm_extent, lce)->rs_left += count;
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+
+out:
+	put_ldev(mdev);
+}
+
+static
+struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
+{
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int wakeup = 0;
+	unsigned long rs_flags;
+
+	spin_lock_irq(&mdev->al_lock);
+	if (mdev->resync_locked > mdev->resync->nr_elements/2) {
+		spin_unlock_irq(&mdev->al_lock);
+		return NULL;
+	}
+	e = lc_get(mdev->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (bm_ext) {
+		if (bm_ext->lce.lc_number != enr) {
+			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+			bm_ext->rs_failed = 0;
+			lc_changed(mdev->resync, &bm_ext->lce);
+			wakeup = 1;
+		}
+		if (bm_ext->lce.refcnt == 1)
+			mdev->resync_locked++;
+		set_bit(BME_NO_WRITES, &bm_ext->flags);
+	}
+	rs_flags = mdev->resync->flags;
+	spin_unlock_irq(&mdev->al_lock);
+	if (wakeup)
+		wake_up(&mdev->al_wait);
+
+	if (!bm_ext) {
+		if (rs_flags & LC_STARVING)
+			dev_warn(DEV, "Have to wait for element"
+			     " (resync LRU too small?)\n");
+		BUG_ON(rs_flags & LC_DIRTY);
+	}
+
+	return bm_ext;
+}
+
+static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
+{
+	struct lc_element *al_ext;
+	int rv = 0;
+
+	spin_lock_irq(&mdev->al_lock);
+	if (unlikely(enr == mdev->act_log->new_number))
+		rv = 1;
+	else {
+		al_ext = lc_find(mdev->act_log, enr);
+		if (al_ext) {
+			if (al_ext->refcnt)
+				rv = 1;
+		}
+	}
+	spin_unlock_irq(&mdev->al_lock);
+
+	/*
+	if (unlikely(rv)) {
+		dev_info(DEV, "Delaying sync read until app's write is done\n");
+	}
+	*/
+	return rv;
+}
+
+/**
+ * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
+ * @mdev:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
+ */
+int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct bm_extent *bm_ext;
+	int i, sig;
+
+	trace_drbd_resync(mdev, TRACE_LVL_ALL,
+			  "drbd_rs_begin_io: sector=%llus (rs_end=%d)\n",
+			  (unsigned long long)sector, enr);
+
+	sig = wait_event_interruptible(mdev->al_wait,
+			(bm_ext = _bme_get(mdev, enr)));
+	if (sig)
+		return 0;
+
+	if (test_bit(BME_LOCKED, &bm_ext->flags))
+		return 1;
+
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		sig = wait_event_interruptible(mdev->al_wait,
+				!_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
+		if (sig) {
+			spin_lock_irq(&mdev->al_lock);
+			if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+				clear_bit(BME_NO_WRITES, &bm_ext->flags);
+				mdev->resync_locked--;
+				wake_up(&mdev->al_wait);
+			}
+			spin_unlock_irq(&mdev->al_lock);
+			return 0;
+		}
+	}
+
+	set_bit(BME_LOCKED, &bm_ext->flags);
+
+	return 1;
+}
+
+/**
+ * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
+ * @mdev:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
+ * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
+ * if there is still application IO going on in this area.
+ */
+int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+
+	trace_drbd_resync(mdev, TRACE_LVL_ALL, "drbd_try_rs_begin_io: sector=%llus\n",
+			  (unsigned long long)sector);
+
+	spin_lock_irq(&mdev->al_lock);
+	if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
+		/* in case you have very heavy scattered io, it may
+		 * stall the syncer undefined if we give up the ref count
+		 * when we try again and requeue.
+		 *
+		 * if we don't give up the refcount, but the next time
+		 * we are scheduled this extent has been "synced" by new
+		 * application writes, we'd miss the lc_put on the
+		 * extent we keep the refcount on.
+		 * so we remembered which extent we had to try again, and
+		 * if the next requested one is something else, we do
+		 * the lc_put here...
+		 * we also have to wake_up
+		 */
+
+		trace_drbd_resync(mdev, TRACE_LVL_ALL,
+				  "dropping %u, apparently got 'synced' by application io\n",
+				  mdev->resync_wenr);
+
+		e = lc_find(mdev->resync, mdev->resync_wenr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (bm_ext) {
+			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+			clear_bit(BME_NO_WRITES, &bm_ext->flags);
+			mdev->resync_wenr = LC_FREE;
+			if (lc_put(mdev->resync, &bm_ext->lce) == 0)
+				mdev->resync_locked--;
+			wake_up(&mdev->al_wait);
+		} else {
+			dev_alert(DEV, "LOGIC BUG\n");
+		}
+	}
+	/* TRY. */
+	e = lc_try_get(mdev->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (bm_ext) {
+		if (test_bit(BME_LOCKED, &bm_ext->flags))
+			goto proceed;
+		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			mdev->resync_locked++;
+		} else {
+			/* we did set the BME_NO_WRITES,
+			 * but then could not set BME_LOCKED,
+			 * so we tried again.
+			 * drop the extra reference. */
+			trace_drbd_resync(mdev, TRACE_LVL_ALL,
+					  "dropping extra reference on %u\n", enr);
+
+			bm_ext->lce.refcnt--;
+			D_ASSERT(bm_ext->lce.refcnt > 0);
+		}
+		goto check_al;
+	} else {
+		/* do we rather want to try later? */
+		if (mdev->resync_locked > mdev->resync->nr_elements-3) {
+			trace_drbd_resync(mdev, TRACE_LVL_ALL,
+					  "resync_locked = %u!\n", mdev->resync_locked);
+
+			goto try_again;
+		}
+		/* Do or do not. There is no try. -- Yoda */
+		e = lc_get(mdev->resync, enr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (!bm_ext) {
+			const unsigned long rs_flags = mdev->resync->flags;
+			if (rs_flags & LC_STARVING)
+				dev_warn(DEV, "Have to wait for element"
+				     " (resync LRU too small?)\n");
+			BUG_ON(rs_flags & LC_DIRTY);
+			goto try_again;
+		}
+		if (bm_ext->lce.lc_number != enr) {
+			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+			bm_ext->rs_failed = 0;
+			lc_changed(mdev->resync, &bm_ext->lce);
+			wake_up(&mdev->al_wait);
+			D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+		}
+		set_bit(BME_NO_WRITES, &bm_ext->flags);
+		D_ASSERT(bm_ext->lce.refcnt == 1);
+		mdev->resync_locked++;
+		goto check_al;
+	}
+check_al:
+	trace_drbd_resync(mdev, TRACE_LVL_ALL, "checking al for %u\n", enr);
+
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		if (unlikely(al_enr+i == mdev->act_log->new_number))
+			goto try_again;
+		if (lc_is_used(mdev->act_log, al_enr+i))
+			goto try_again;
+	}
+	set_bit(BME_LOCKED, &bm_ext->flags);
+proceed:
+	mdev->resync_wenr = LC_FREE;
+	spin_unlock_irq(&mdev->al_lock);
+	return 0;
+
+try_again:
+	trace_drbd_resync(mdev, TRACE_LVL_ALL, "need to try again for %u\n", enr);
+	if (bm_ext)
+		mdev->resync_wenr = enr;
+	spin_unlock_irq(&mdev->al_lock);
+	return -EAGAIN;
+}
+
+void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	unsigned long flags;
+
+	trace_drbd_resync(mdev, TRACE_LVL_ALL,
+			  "drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n",
+			  (long long)sector, enr);
+
+	spin_lock_irqsave(&mdev->al_lock, flags);
+	e = lc_find(mdev->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (!bm_ext) {
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
+		return;
+	}
+
+	if (bm_ext->lce.refcnt == 0) {
+		spin_unlock_irqrestore(&mdev->al_lock, flags);
+		dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
+		    "but refcnt is 0!?\n",
+		    (unsigned long long)sector, enr);
+		return;
+	}
+
+	if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+		clear_bit(BME_LOCKED, &bm_ext->flags);
+		clear_bit(BME_NO_WRITES, &bm_ext->flags);
+		mdev->resync_locked--;
+		wake_up(&mdev->al_wait);
+	}
+
+	spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+/**
+ * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
+ * @mdev:	DRBD device.
+ */
+void drbd_rs_cancel_all(struct drbd_conf *mdev)
+{
+	trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_cancel_all\n");
+
+	spin_lock_irq(&mdev->al_lock);
+
+	if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
+		lc_reset(mdev->resync);
+		put_ldev(mdev);
+	}
+	mdev->resync_locked = 0;
+	mdev->resync_wenr = LC_FREE;
+	spin_unlock_irq(&mdev->al_lock);
+	wake_up(&mdev->al_wait);
+}
+
+/**
+ * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
+ * @mdev:	DRBD device.
+ *
+ * Returns 0 upon success, -EAGAIN if at least one reference count was
+ * not zero.
+ */
+int drbd_rs_del_all(struct drbd_conf *mdev)
+{
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+
+	trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_del_all\n");
+
+	spin_lock_irq(&mdev->al_lock);
+
+	if (get_ldev_if_state(mdev, D_FAILED)) {
+		/* ok, ->resync is there. */
+		for (i = 0; i < mdev->resync->nr_elements; i++) {
+			e = lc_element_by_index(mdev->resync, i);
+			bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+			if (bm_ext->lce.lc_number == LC_FREE)
+				continue;
+			if (bm_ext->lce.lc_number == mdev->resync_wenr) {
+				dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
+				     " got 'synced' by application io\n",
+				     mdev->resync_wenr);
+				D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+				D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+				clear_bit(BME_NO_WRITES, &bm_ext->flags);
+				mdev->resync_wenr = LC_FREE;
+				lc_put(mdev->resync, &bm_ext->lce);
+			}
+			if (bm_ext->lce.refcnt != 0) {
+				dev_info(DEV, "Retrying drbd_rs_del_all() later. "
+				     "refcnt=%d\n", bm_ext->lce.refcnt);
+				put_ldev(mdev);
+				spin_unlock_irq(&mdev->al_lock);
+				return -EAGAIN;
+			}
+			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
+			lc_del(mdev->resync, &bm_ext->lce);
+		}
+		D_ASSERT(mdev->resync->used == 0);
+		put_ldev(mdev);
+	}
+	spin_unlock_irq(&mdev->al_lock);
+
+	return 0;
+}
+
+/**
+ * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
+ * @mdev:	DRBD device.
+ * @sector:	The sector number.
+ * @size:	Size of failed IO operation, in byte.
+ */
+void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count;
+	sector_t esector, nr_sectors;
+	int wake_up = 0;
+
+	trace_drbd_resync(mdev, TRACE_LVL_SUMMARY,
+			  "drbd_rs_failed_io: sector=%llus, size=%u\n",
+			  (unsigned long long)sector, size);
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+		dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
+				(unsigned long long)sector, size);
+		return;
+	}
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	ERR_IF(sector >= nr_sectors) return;
+	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	/*
+	 * round up start sector, round down end sector.  we make sure we only
+	 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
+	if (unlikely(esector < BM_SECT_PER_BIT-1))
+		return;
+	if (unlikely(esector == (nr_sectors-1)))
+		ebnr = lbnr;
+	else
+		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+	if (sbnr > ebnr)
+		return;
+
+	/*
+	 * ok, (capacity & 7) != 0 sometimes, but who cares...
+	 * we count rs_{total,left} in bits, not sectors.
+	 */
+	spin_lock_irq(&mdev->al_lock);
+	count = drbd_bm_count_bits(mdev, sbnr, ebnr);
+	if (count) {
+		mdev->rs_failed += count;
+
+		if (get_ldev(mdev)) {
+			drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
+			put_ldev(mdev);
+		}
+
+		/* just wake_up unconditional now, various lc_chaged(),
+		 * lc_put() in drbd_try_clear_on_disk_bm(). */
+		wake_up = 1;
+	}
+	spin_unlock_irq(&mdev->al_lock);
+	if (wake_up)
+		wake_up(&mdev->al_wait);
+}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 000000000000..b61057e77882
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1327 @@
+/*
+   drbd_bitmap.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bitops.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/drbd.h>
+#include <asm/kmap_types.h>
+#include "drbd_int.h"
+
+/* OPAQUE outside this file!
+ * interface defined in drbd_int.h
+
+ * convention:
+ * function name drbd_bm_... => used elsewhere, "public".
+ * function name      bm_... => internal to implementation, "private".
+
+ * Note that since find_first_bit returns int, at the current granularity of
+ * the bitmap (4KB per byte), this implementation "only" supports up to
+ * 1<<(32+12) == 16 TB...
+ */
+
+/*
+ * NOTE
+ *  Access to the *bm_pages is protected by bm_lock.
+ *  It is safe to read the other members within the lock.
+ *
+ *  drbd_bm_set_bits is called from bio_endio callbacks,
+ *  We may be called with irq already disabled,
+ *  so we need spin_lock_irqsave().
+ *  And we need the kmap_atomic.
+ */
+struct drbd_bitmap {
+	struct page **bm_pages;
+	spinlock_t bm_lock;
+	/* WARNING unsigned long bm_*:
+	 * 32bit number of bit offset is just enough for 512 MB bitmap.
+	 * it will blow up if we make the bitmap bigger...
+	 * not that it makes much sense to have a bitmap that large,
+	 * rather change the granularity to 16k or 64k or something.
+	 * (that implies other problems, however...)
+	 */
+	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
+	unsigned long bm_bits;
+	size_t   bm_words;
+	size_t   bm_number_of_pages;
+	sector_t bm_dev_capacity;
+	struct semaphore bm_change; /* serializes resize operations */
+
+	atomic_t bm_async_io;
+	wait_queue_head_t bm_io_wait;
+
+	unsigned long  bm_flags;
+
+	/* debugging aid, in case we are still racy somewhere */
+	char          *bm_why;
+	struct task_struct *bm_task;
+};
+
+/* definition of bits in bm_flags */
+#define BM_LOCKED       0
+#define BM_MD_IO_ERROR  1
+#define BM_P_VMALLOCED  2
+
+static int bm_is_locked(struct drbd_bitmap *b)
+{
+	return test_bit(BM_LOCKED, &b->bm_flags);
+}
+
+#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
+static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	if (!__ratelimit(&drbd_ratelimit_state))
+		return;
+	dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
+	    current == mdev->receiver.task ? "receiver" :
+	    current == mdev->asender.task  ? "asender"  :
+	    current == mdev->worker.task   ? "worker"   : current->comm,
+	    func, b->bm_why ?: "?",
+	    b->bm_task == mdev->receiver.task ? "receiver" :
+	    b->bm_task == mdev->asender.task  ? "asender"  :
+	    b->bm_task == mdev->worker.task   ? "worker"   : "?");
+}
+
+void drbd_bm_lock(struct drbd_conf *mdev, char *why)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	int trylock_failed;
+
+	if (!b) {
+		dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
+		return;
+	}
+
+	trylock_failed = down_trylock(&b->bm_change);
+
+	if (trylock_failed) {
+		dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
+		    current == mdev->receiver.task ? "receiver" :
+		    current == mdev->asender.task  ? "asender"  :
+		    current == mdev->worker.task   ? "worker"   : current->comm,
+		    why, b->bm_why ?: "?",
+		    b->bm_task == mdev->receiver.task ? "receiver" :
+		    b->bm_task == mdev->asender.task  ? "asender"  :
+		    b->bm_task == mdev->worker.task   ? "worker"   : "?");
+		down(&b->bm_change);
+	}
+	if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
+		dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
+
+	b->bm_why  = why;
+	b->bm_task = current;
+}
+
+void drbd_bm_unlock(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	if (!b) {
+		dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
+		return;
+	}
+
+	if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
+		dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
+
+	b->bm_why  = NULL;
+	b->bm_task = NULL;
+	up(&b->bm_change);
+}
+
+/* word offset to long pointer */
+static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
+{
+	struct page *page;
+	unsigned long page_nr;
+
+	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
+	page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	page = b->bm_pages[page_nr];
+
+	return (unsigned long *) kmap_atomic(page, km);
+}
+
+static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
+{
+	return __bm_map_paddr(b, offset, KM_IRQ1);
+}
+
+static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
+{
+	kunmap_atomic(p_addr, km);
+};
+
+static void bm_unmap(unsigned long *p_addr)
+{
+	return __bm_unmap(p_addr, KM_IRQ1);
+}
+
+/* long word offset of _bitmap_ sector */
+#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* word offset from start of bitmap to word number _in_page_
+ * modulo longs per page
+#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
+ hm, well, Philipp thinks gcc might not optimze the % into & (... - 1)
+ so do it explicitly:
+ */
+#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
+
+/* Long words per page */
+#define LWPP (PAGE_SIZE/sizeof(long))
+
+/*
+ * actually most functions herein should take a struct drbd_bitmap*, not a
+ * struct drbd_conf*, but for the debug macros I like to have the mdev around
+ * to be able to report device specific.
+ */
+
+static void bm_free_pages(struct page **pages, unsigned long number)
+{
+	unsigned long i;
+	if (!pages)
+		return;
+
+	for (i = 0; i < number; i++) {
+		if (!pages[i]) {
+			printk(KERN_ALERT "drbd: bm_free_pages tried to free "
+					  "a NULL pointer; i=%lu n=%lu\n",
+					  i, number);
+			continue;
+		}
+		__free_page(pages[i]);
+		pages[i] = NULL;
+	}
+}
+
+static void bm_vk_free(void *ptr, int v)
+{
+	if (v)
+		vfree(ptr);
+	else
+		kfree(ptr);
+}
+
+/*
+ * "have" and "want" are NUMBER OF PAGES.
+ */
+static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
+{
+	struct page **old_pages = b->bm_pages;
+	struct page **new_pages, *page;
+	unsigned int i, bytes, vmalloced = 0;
+	unsigned long have = b->bm_number_of_pages;
+
+	BUG_ON(have == 0 && old_pages != NULL);
+	BUG_ON(have != 0 && old_pages == NULL);
+
+	if (have == want)
+		return old_pages;
+
+	/* Trying kmalloc first, falling back to vmalloc.
+	 * GFP_KERNEL is ok, as this is done when a lower level disk is
+	 * "attached" to the drbd.  Context is receiver thread or cqueue
+	 * thread.  As we have no disk yet, we are not in the IO path,
+	 * not even the IO path of the peer. */
+	bytes = sizeof(struct page *)*want;
+	new_pages = kmalloc(bytes, GFP_KERNEL);
+	if (!new_pages) {
+		new_pages = vmalloc(bytes);
+		if (!new_pages)
+			return NULL;
+		vmalloced = 1;
+	}
+
+	memset(new_pages, 0, bytes);
+	if (want >= have) {
+		for (i = 0; i < have; i++)
+			new_pages[i] = old_pages[i];
+		for (; i < want; i++) {
+			page = alloc_page(GFP_HIGHUSER);
+			if (!page) {
+				bm_free_pages(new_pages + have, i - have);
+				bm_vk_free(new_pages, vmalloced);
+				return NULL;
+			}
+			new_pages[i] = page;
+		}
+	} else {
+		for (i = 0; i < want; i++)
+			new_pages[i] = old_pages[i];
+		/* NOT HERE, we are outside the spinlock!
+		bm_free_pages(old_pages + want, have - want);
+		*/
+	}
+
+	if (vmalloced)
+		set_bit(BM_P_VMALLOCED, &b->bm_flags);
+	else
+		clear_bit(BM_P_VMALLOCED, &b->bm_flags);
+
+	return new_pages;
+}
+
+/*
+ * called on driver init only. TODO call when a device is created.
+ * allocates the drbd_bitmap, and stores it in mdev->bitmap.
+ */
+int drbd_bm_init(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	WARN_ON(b != NULL);
+	b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
+	if (!b)
+		return -ENOMEM;
+	spin_lock_init(&b->bm_lock);
+	init_MUTEX(&b->bm_change);
+	init_waitqueue_head(&b->bm_io_wait);
+
+	mdev->bitmap = b;
+
+	return 0;
+}
+
+sector_t drbd_bm_capacity(struct drbd_conf *mdev)
+{
+	ERR_IF(!mdev->bitmap) return 0;
+	return mdev->bitmap->bm_dev_capacity;
+}
+
+/* called on driver unload. TODO: call when a device is destroyed.
+ */
+void drbd_bm_cleanup(struct drbd_conf *mdev)
+{
+	ERR_IF (!mdev->bitmap) return;
+	bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
+	bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
+	kfree(mdev->bitmap);
+	mdev->bitmap = NULL;
+}
+
+/*
+ * since (b->bm_bits % BITS_PER_LONG) != 0,
+ * this masks out the remaining bits.
+ * Returns the number of bits cleared.
+ */
+static int bm_clear_surplus(struct drbd_bitmap *b)
+{
+	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
+	size_t w = b->bm_bits >> LN2_BPL;
+	int cleared = 0;
+	unsigned long *p_addr, *bm;
+
+	p_addr = bm_map_paddr(b, w);
+	bm = p_addr + MLPP(w);
+	if (w < b->bm_words) {
+		cleared = hweight_long(*bm & ~mask);
+		*bm &= mask;
+		w++; bm++;
+	}
+
+	if (w < b->bm_words) {
+		cleared += hweight_long(*bm);
+		*bm = 0;
+	}
+	bm_unmap(p_addr);
+	return cleared;
+}
+
+static void bm_set_surplus(struct drbd_bitmap *b)
+{
+	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
+	size_t w = b->bm_bits >> LN2_BPL;
+	unsigned long *p_addr, *bm;
+
+	p_addr = bm_map_paddr(b, w);
+	bm = p_addr + MLPP(w);
+	if (w < b->bm_words) {
+		*bm |= ~mask;
+		bm++; w++;
+	}
+
+	if (w < b->bm_words) {
+		*bm = ~(0UL);
+	}
+	bm_unmap(p_addr);
+}
+
+static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
+{
+	unsigned long *p_addr, *bm, offset = 0;
+	unsigned long bits = 0;
+	unsigned long i, do_now;
+
+	while (offset < b->bm_words) {
+		i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
+		p_addr = __bm_map_paddr(b, offset, KM_USER0);
+		bm = p_addr + MLPP(offset);
+		while (i--) {
+#ifndef __LITTLE_ENDIAN
+			if (swap_endian)
+				*bm = lel_to_cpu(*bm);
+#endif
+			bits += hweight_long(*bm++);
+		}
+		__bm_unmap(p_addr, KM_USER0);
+		offset += do_now;
+		cond_resched();
+	}
+
+	return bits;
+}
+
+static unsigned long bm_count_bits(struct drbd_bitmap *b)
+{
+	return __bm_count_bits(b, 0);
+}
+
+static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
+{
+	return __bm_count_bits(b, 1);
+}
+
+/* offset and len in long words.*/
+static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
+{
+	unsigned long *p_addr, *bm;
+	size_t do_now, end;
+
+#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
+
+	end = offset + len;
+
+	if (end > b->bm_words) {
+		printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
+		return;
+	}
+
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
+		p_addr = bm_map_paddr(b, offset);
+		bm = p_addr + MLPP(offset);
+		if (bm+do_now > p_addr + LWPP) {
+			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+			       p_addr, bm, (int)do_now);
+			break; /* breaks to after catch_oob_access_end() only! */
+		}
+		memset(bm, c, do_now * sizeof(long));
+		bm_unmap(p_addr);
+		offset += do_now;
+	}
+}
+
+/*
+ * make sure the bitmap has enough room for the attached storage,
+ * if necessary, resize.
+ * called whenever we may have changed the device size.
+ * returns -ENOMEM if we could not allocate enough memory, 0 on success.
+ * In case this is actually a resize, we copy the old bitmap into the new one.
+ * Otherwise, the bitmap is initialized to all bits set.
+ */
+int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long bits, words, owords, obits, *p_addr, *bm;
+	unsigned long want, have, onpages; /* number of pages */
+	struct page **npages, **opages = NULL;
+	int err = 0, growing;
+	int opages_vmalloced;
+
+	ERR_IF(!b) return -ENOMEM;
+
+	drbd_bm_lock(mdev, "resize");
+
+	dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
+			(unsigned long long)capacity);
+
+	if (capacity == b->bm_dev_capacity)
+		goto out;
+
+	opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
+
+	if (capacity == 0) {
+		spin_lock_irq(&b->bm_lock);
+		opages = b->bm_pages;
+		onpages = b->bm_number_of_pages;
+		owords = b->bm_words;
+		b->bm_pages = NULL;
+		b->bm_number_of_pages =
+		b->bm_set   =
+		b->bm_bits  =
+		b->bm_words =
+		b->bm_dev_capacity = 0;
+		spin_unlock_irq(&b->bm_lock);
+		bm_free_pages(opages, onpages);
+		bm_vk_free(opages, opages_vmalloced);
+		goto out;
+	}
+	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
+
+	/* if we would use
+	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
+	   a 32bit host could present the wrong number of words
+	   to a 64bit host.
+	*/
+	words = ALIGN(bits, 64) >> LN2_BPL;
+
+	if (get_ldev(mdev)) {
+		D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
+		put_ldev(mdev);
+	}
+
+	/* one extra long to catch off by one errors */
+	want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+	have = b->bm_number_of_pages;
+	if (want == have) {
+		D_ASSERT(b->bm_pages != NULL);
+		npages = b->bm_pages;
+	} else {
+		if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
+			npages = NULL;
+		else
+			npages = bm_realloc_pages(b, want);
+	}
+
+	if (!npages) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock_irq(&b->bm_lock);
+	opages = b->bm_pages;
+	owords = b->bm_words;
+	obits  = b->bm_bits;
+
+	growing = bits > obits;
+	if (opages)
+		bm_set_surplus(b);
+
+	b->bm_pages = npages;
+	b->bm_number_of_pages = want;
+	b->bm_bits  = bits;
+	b->bm_words = words;
+	b->bm_dev_capacity = capacity;
+
+	if (growing) {
+		bm_memset(b, owords, 0xff, words-owords);
+		b->bm_set += bits - obits;
+	}
+
+	if (want < have) {
+		/* implicit: (opages != NULL) && (opages != npages) */
+		bm_free_pages(opages + want, have - want);
+	}
+
+	p_addr = bm_map_paddr(b, words);
+	bm = p_addr + MLPP(words);
+	*bm = DRBD_MAGIC;
+	bm_unmap(p_addr);
+
+	(void)bm_clear_surplus(b);
+
+	spin_unlock_irq(&b->bm_lock);
+	if (opages != npages)
+		bm_vk_free(opages, opages_vmalloced);
+	if (!growing)
+		b->bm_set = bm_count_bits(b);
+	dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
+
+ out:
+	drbd_bm_unlock(mdev);
+	return err;
+}
+
+/* inherently racy:
+ * if not protected by other means, return value may be out of date when
+ * leaving this function...
+ * we still need to lock it, since it is important that this returns
+ * bm_set == 0 precisely.
+ *
+ * maybe bm_set should be atomic_t ?
+ */
+static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long s;
+	unsigned long flags;
+
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	s = b->bm_set;
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+
+	return s;
+}
+
+unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
+{
+	unsigned long s;
+	/* if I don't have a disk, I don't know about out-of-sync status */
+	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
+		return 0;
+	s = _drbd_bm_total_weight(mdev);
+	put_ldev(mdev);
+	return s;
+}
+
+size_t drbd_bm_words(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	return b->bm_words;
+}
+
+unsigned long drbd_bm_bits(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return 0;
+
+	return b->bm_bits;
+}
+
+/* merge number words from buffer into the bitmap starting at offset.
+ * buffer[i] is expected to be little endian unsigned long.
+ * bitmap must be locked by drbd_bm_lock.
+ * currently only used from receive_bitmap.
+ */
+void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+			unsigned long *buffer)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
+	unsigned long word, bits;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+	if (number == 0)
+		return;
+	WARN_ON(offset >= b->bm_words);
+	WARN_ON(end    >  b->bm_words);
+
+	spin_lock_irq(&b->bm_lock);
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+		p_addr = bm_map_paddr(b, offset);
+		bm = p_addr + MLPP(offset);
+		offset += do_now;
+		while (do_now--) {
+			bits = hweight_long(*bm);
+			word = *bm | lel_to_cpu(*buffer++);
+			*bm++ = word;
+			b->bm_set += hweight_long(word) - bits;
+		}
+		bm_unmap(p_addr);
+	}
+	/* with 32bit <-> 64bit cross-platform connect
+	 * this is only correct for current usage,
+	 * where we _know_ that we are 64 bit aligned,
+	 * and know that this function is used in this way, too...
+	 */
+	if (end == b->bm_words)
+		b->bm_set -= bm_clear_surplus(b);
+
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* copy number words from the bitmap starting at offset into the buffer.
+ * buffer[i] will be little endian unsigned long.
+ */
+void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
+		     unsigned long *buffer)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	if ((offset >= b->bm_words) ||
+	    (end    >  b->bm_words) ||
+	    (number <= 0))
+		dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
+			(unsigned long)	offset,
+			(unsigned long)	number,
+			(unsigned long) b->bm_words);
+	else {
+		while (offset < end) {
+			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+			p_addr = bm_map_paddr(b, offset);
+			bm = p_addr + MLPP(offset);
+			offset += do_now;
+			while (do_now--)
+				*buffer++ = cpu_to_lel(*bm++);
+			bm_unmap(p_addr);
+		}
+	}
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* set all bits in the bitmap */
+void drbd_bm_set_all(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	bm_memset(b, 0, 0xff, b->bm_words);
+	(void)bm_clear_surplus(b);
+	b->bm_set = b->bm_bits;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* clear all bits in the bitmap */
+void drbd_bm_clear_all(struct drbd_conf *mdev)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	ERR_IF(!b) return;
+	ERR_IF(!b->bm_pages) return;
+
+	spin_lock_irq(&b->bm_lock);
+	bm_memset(b, 0, 0, b->bm_words);
+	b->bm_set = 0;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+static void bm_async_io_complete(struct bio *bio, int error)
+{
+	struct drbd_bitmap *b = bio->bi_private;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+
+	/* strange behavior of some lower level drivers...
+	 * fail the request by clearing the uptodate flag,
+	 * but do not return any error?!
+	 * do we want to WARN() on this? */
+	if (!error && !uptodate)
+		error = -EIO;
+
+	if (error) {
+		/* doh. what now?
+		 * for now, set all bits, and flag MD_IO_ERROR */
+		__set_bit(BM_MD_IO_ERROR, &b->bm_flags);
+	}
+	if (atomic_dec_and_test(&b->bm_async_io))
+		wake_up(&b->bm_io_wait);
+
+	bio_put(bio);
+}
+
+static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
+{
+	/* we are process context. we always get a bio */
+	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	unsigned int len;
+	sector_t on_disk_sector =
+		mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
+	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
+
+	/* this might happen with very small
+	 * flexible external meta data device */
+	len = min_t(unsigned int, PAGE_SIZE,
+		(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
+
+	bio->bi_bdev = mdev->ldev->md_bdev;
+	bio->bi_sector = on_disk_sector;
+	bio_add_page(bio, b->bm_pages[page_nr], len, 0);
+	bio->bi_private = b;
+	bio->bi_end_io = bm_async_io_complete;
+
+	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+		bio->bi_rw |= rw;
+		bio_endio(bio, -EIO);
+	} else {
+		submit_bio(rw, bio);
+	}
+}
+
+# if defined(__LITTLE_ENDIAN)
+	/* nothing to do, on disk == in memory */
+# define bm_cpu_to_lel(x) ((void)0)
+# else
+void bm_cpu_to_lel(struct drbd_bitmap *b)
+{
+	/* need to cpu_to_lel all the pages ...
+	 * this may be optimized by using
+	 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
+	 * the following is still not optimal, but better than nothing */
+	unsigned int i;
+	unsigned long *p_addr, *bm;
+	if (b->bm_set == 0) {
+		/* no page at all; avoid swap if all is 0 */
+		i = b->bm_number_of_pages;
+	} else if (b->bm_set == b->bm_bits) {
+		/* only the last page */
+		i = b->bm_number_of_pages - 1;
+	} else {
+		/* all pages */
+		i = 0;
+	}
+	for (; i < b->bm_number_of_pages; i++) {
+		p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
+		for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
+			*bm = cpu_to_lel(*bm);
+		kunmap_atomic(p_addr, KM_USER0);
+	}
+}
+# endif
+/* lel_to_cpu == cpu_to_lel */
+# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
+
+/*
+ * bm_rw: read/write the whole bitmap from/to its on disk location.
+ */
+static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	/* sector_t sector; */
+	int bm_words, num_pages, i;
+	unsigned long now;
+	char ppb[10];
+	int err = 0;
+
+	WARN_ON(!bm_is_locked(b));
+
+	/* no spinlock here, the drbd_bm_lock should be enough! */
+
+	bm_words  = drbd_bm_words(mdev);
+	num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+	/* on disk bitmap is little endian */
+	if (rw == WRITE)
+		bm_cpu_to_lel(b);
+
+	now = jiffies;
+	atomic_set(&b->bm_async_io, num_pages);
+	__clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
+
+	/* let the layers below us try to merge these bios... */
+	for (i = 0; i < num_pages; i++)
+		bm_page_io_async(mdev, b, i, rw);
+
+	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
+	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
+
+	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
+		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
+		drbd_chk_io_error(mdev, 1, TRUE);
+		err = -EIO;
+	}
+
+	now = jiffies;
+	if (rw == WRITE) {
+		/* swap back endianness */
+		bm_lel_to_cpu(b);
+		/* flush bitmap to stable storage */
+		drbd_md_flush(mdev);
+	} else /* rw == READ */ {
+		/* just read, if necessary adjust endianness */
+		b->bm_set = bm_count_bits_swap_endian(b);
+		dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
+		     jiffies - now);
+	}
+	now = b->bm_set;
+
+	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+	     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+
+	return err;
+}
+
+/**
+ * drbd_bm_read() - Read the whole bitmap from its on disk location.
+ * @mdev:	DRBD device.
+ */
+int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, READ);
+}
+
+/**
+ * drbd_bm_write() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ */
+int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE);
+}
+
+/**
+ * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
+ * @mdev:	DRBD device.
+ * @enr:	Extent number in the resync lru (happens to be sector offset)
+ *
+ * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
+ * by a single sector write. Therefore enr == sector offset from the
+ * start of the bitmap.
+ */
+int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
+{
+	sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+				      + mdev->ldev->md.bm_offset;
+	int bm_words, num_words, offset;
+	int err = 0;
+
+	mutex_lock(&mdev->md_io_mutex);
+	bm_words  = drbd_bm_words(mdev);
+	offset    = S2W(enr);	/* word offset into bitmap */
+	num_words = min(S2W(1), bm_words - offset);
+	if (num_words < S2W(1))
+		memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
+	drbd_bm_get_lel(mdev, offset, num_words,
+			page_address(mdev->md_io_page));
+	if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
+		int i;
+		err = -EIO;
+		dev_err(DEV, "IO ERROR writing bitmap sector %lu "
+		    "(meta-disk sector %llus)\n",
+		    enr, (unsigned long long)on_disk_sector);
+		drbd_chk_io_error(mdev, 1, TRUE);
+		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
+			drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
+	}
+	mdev->bm_writ_cnt++;
+	mutex_unlock(&mdev->md_io_mutex);
+	return err;
+}
+
+/* NOTE
+ * find_first_bit returns int, we return unsigned long.
+ * should not make much difference anyways, but ...
+ *
+ * this returns a bit number, NOT a sector!
+ */
+#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
+static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
+	const int find_zero_bit, const enum km_type km)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long i = -1UL;
+	unsigned long *p_addr;
+	unsigned long bit_offset; /* bit offset of the mapped page. */
+
+	if (bm_fo > b->bm_bits) {
+		dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+	} else {
+		while (bm_fo < b->bm_bits) {
+			unsigned long offset;
+			bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
+			offset = bit_offset >> LN2_BPL;    /* word offset of the page */
+			p_addr = __bm_map_paddr(b, offset, km);
+
+			if (find_zero_bit)
+				i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+			else
+				i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+
+			__bm_unmap(p_addr, km);
+			if (i < PAGE_SIZE*8) {
+				i = bit_offset + i;
+				if (i >= b->bm_bits)
+					break;
+				goto found;
+			}
+			bm_fo = bit_offset + PAGE_SIZE*8;
+		}
+		i = -1UL;
+	}
+ found:
+	return i;
+}
+
+static unsigned long bm_find_next(struct drbd_conf *mdev,
+	unsigned long bm_fo, const int find_zero_bit)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long i = -1UL;
+
+	ERR_IF(!b) return i;
+	ERR_IF(!b->bm_pages) return i;
+
+	spin_lock_irq(&b->bm_lock);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+
+	i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
+
+	spin_unlock_irq(&b->bm_lock);
+	return i;
+}
+
+unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	return bm_find_next(mdev, bm_fo, 0);
+}
+
+#if 0
+/* not yet needed for anything. */
+unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	return bm_find_next(mdev, bm_fo, 1);
+}
+#endif
+
+/* does not spin_lock_irqsave.
+ * you must take drbd_bm_lock() first */
+unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	/* WARN_ON(!bm_is_locked(mdev)); */
+	return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
+}
+
+unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
+{
+	/* WARN_ON(!bm_is_locked(mdev)); */
+	return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector.
+ * expected to be called for only a few bits (e - s about BITS_PER_LONG).
+ * Must hold bitmap lock already. */
+int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+	unsigned long e, int val, const enum km_type km)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr = NULL;
+	unsigned long bitnr;
+	unsigned long last_page_nr = -1UL;
+	int c = 0;
+
+	if (e >= b->bm_bits) {
+		dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
+				s, e, b->bm_bits);
+		e = b->bm_bits ? b->bm_bits -1 : 0;
+	}
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		unsigned long offset = bitnr>>LN2_BPL;
+		unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+		if (page_nr != last_page_nr) {
+			if (p_addr)
+				__bm_unmap(p_addr, km);
+			p_addr = __bm_map_paddr(b, offset, km);
+			last_page_nr = page_nr;
+		}
+		if (val)
+			c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
+		else
+			c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
+	}
+	if (p_addr)
+		__bm_unmap(p_addr, km);
+	b->bm_set += c;
+	return c;
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector */
+int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+	const unsigned long e, int val)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = mdev->bitmap;
+	int c = 0;
+
+	ERR_IF(!b) return 1;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+
+	c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+/* returns number of bits changed 0 -> 1 */
+int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	return bm_change_bits_to(mdev, s, e, 1);
+}
+
+/* returns number of bits changed 1 -> 0 */
+int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	return -bm_change_bits_to(mdev, s, e, 0);
+}
+
+/* sets all bits in full words,
+ * from first_word up to, but not including, last_word */
+static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
+		int page_nr, int first_word, int last_word)
+{
+	int i;
+	int bits;
+	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
+	for (i = first_word; i < last_word; i++) {
+		bits = hweight_long(paddr[i]);
+		paddr[i] = ~0UL;
+		b->bm_set += BITS_PER_LONG - bits;
+	}
+	kunmap_atomic(paddr, KM_USER0);
+}
+
+/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
+ * You must first drbd_bm_lock().
+ * Can be called to set the whole bitmap in one go.
+ * Sets bits from s to e _inclusive_. */
+void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	/* First set_bit from the first bit (s)
+	 * up to the next long boundary (sl),
+	 * then assign full words up to the last long boundary (el),
+	 * then set_bit up to and including the last bit (e).
+	 *
+	 * Do not use memset, because we must account for changes,
+	 * so we need to loop over the words with hweight() anyways.
+	 */
+	unsigned long sl = ALIGN(s,BITS_PER_LONG);
+	unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
+	int first_page;
+	int last_page;
+	int page_nr;
+	int first_word;
+	int last_word;
+
+	if (e - s <= 3*BITS_PER_LONG) {
+		/* don't bother; el and sl may even be wrong. */
+		__bm_change_bits_to(mdev, s, e, 1, KM_USER0);
+		return;
+	}
+
+	/* difference is large enough that we can trust sl and el */
+
+	/* bits filling the current long */
+	if (sl)
+		__bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
+
+	first_page = sl >> (3 + PAGE_SHIFT);
+	last_page = el >> (3 + PAGE_SHIFT);
+
+	/* MLPP: modulo longs per page */
+	/* LWPP: long words per page */
+	first_word = MLPP(sl >> LN2_BPL);
+	last_word = LWPP;
+
+	/* first and full pages, unless first page == last page */
+	for (page_nr = first_page; page_nr < last_page; page_nr++) {
+		bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
+		cond_resched();
+		first_word = 0;
+	}
+
+	/* last page (respectively only page, for first page == last page) */
+	last_word = MLPP(el >> LN2_BPL);
+	bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
+
+	/* possibly trailing bits.
+	 * example: (e & 63) == 63, el will be e+1.
+	 * if that even was the very last bit,
+	 * it would trigger an assert in __bm_change_bits_to()
+	 */
+	if (el <= e)
+		__bm_change_bits_to(mdev, el, e, 1, KM_USER0);
+}
+
+/* returns bit state
+ * wants bitnr, NOT sector.
+ * inherently racy... area needs to be locked by means of {al,rs}_lru
+ *  1 ... bit set
+ *  0 ... bit not set
+ * -1 ... first out of bounds access, stop testing for bits!
+ */
+int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr;
+	int i;
+
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+	if (bitnr < b->bm_bits) {
+		unsigned long offset = bitnr>>LN2_BPL;
+		p_addr = bm_map_paddr(b, offset);
+		i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
+		bm_unmap(p_addr);
+	} else if (bitnr == b->bm_bits) {
+		i = -1;
+	} else { /* (bitnr > b->bm_bits) */
+		dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
+		i = 0;
+	}
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return i;
+}
+
+/* returns number of bits set in the range [s, e] */
+int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr = NULL, page_nr = -1;
+	unsigned long bitnr;
+	int c = 0;
+	size_t w;
+
+	/* If this is called without a bitmap, that is a bug.  But just to be
+	 * robust in case we screwed up elsewhere, in that case pretend there
+	 * was one dirty bit in the requested area, so we won't try to do a
+	 * local read there (no bitmap probably implies no disk) */
+	ERR_IF(!b) return 1;
+	ERR_IF(!b->bm_pages) return 1;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		w = bitnr >> LN2_BPL;
+		if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
+			page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
+			if (p_addr)
+				bm_unmap(p_addr);
+			p_addr = bm_map_paddr(b, w);
+		}
+		ERR_IF (bitnr >= b->bm_bits) {
+			dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
+		} else {
+			c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+		}
+	}
+	if (p_addr)
+		bm_unmap(p_addr);
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+
+/* inherently racy...
+ * return value may be already out-of-date when this function returns.
+ * but the general usage is that this is only use during a cstate when bits are
+ * only cleared, not set, and typically only care for the case when the return
+ * value is zero, or we already "locked" this "bitmap extent" by other means.
+ *
+ * enr is bm-extent number, since we chose to name one sector (512 bytes)
+ * worth of the bitmap a "bitmap extent".
+ *
+ * TODO
+ * I think since we use it like a reference count, we should use the real
+ * reference count of some bitmap extent element from some lru instead...
+ *
+ */
+int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	int count, s, e;
+	unsigned long flags;
+	unsigned long *p_addr, *bm;
+
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+
+	s = S2W(enr);
+	e = min((size_t)S2W(enr+1), b->bm_words);
+	count = 0;
+	if (s < b->bm_words) {
+		int n = e-s;
+		p_addr = bm_map_paddr(b, s);
+		bm = p_addr + MLPP(s);
+		while (n--)
+			count += hweight_long(*bm++);
+		bm_unmap(p_addr);
+	} else {
+		dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
+	}
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return count;
+}
+
+/* set all bits covered by the AL-extent al_enr */
+unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned long *p_addr, *bm;
+	unsigned long weight;
+	int count, s, e, i, do_now;
+	ERR_IF(!b) return 0;
+	ERR_IF(!b->bm_pages) return 0;
+
+	spin_lock_irq(&b->bm_lock);
+	if (bm_is_locked(b))
+		bm_print_lock_info(mdev);
+	weight = b->bm_set;
+
+	s = al_enr * BM_WORDS_PER_AL_EXT;
+	e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
+	/* assert that s and e are on the same page */
+	D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
+	      ==  s    >> (PAGE_SHIFT - LN2_BPL + 3));
+	count = 0;
+	if (s < b->bm_words) {
+		i = do_now = e-s;
+		p_addr = bm_map_paddr(b, s);
+		bm = p_addr + MLPP(s);
+		while (i--) {
+			count += hweight_long(*bm);
+			*bm = -1UL;
+			bm++;
+		}
+		bm_unmap(p_addr);
+		b->bm_set += do_now*BITS_PER_LONG - count;
+		if (e == b->bm_words)
+			b->bm_set -= bm_clear_surplus(b);
+	} else {
+		dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
+	}
+	weight = b->bm_set - weight;
+	spin_unlock_irq(&b->bm_lock);
+	return weight;
+}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 000000000000..8da602e010bb
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2258 @@
+/*
+  drbd_int.h
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#ifndef _DRBD_INT_H
+#define _DRBD_INT_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/tcp.h>
+#include <linux/mutex.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#include <net/tcp.h>
+#include <linux/lru_cache.h>
+
+#ifdef __CHECKER__
+# define __protected_by(x)       __attribute__((require_context(x,1,999,"rdwr")))
+# define __protected_read_by(x)  __attribute__((require_context(x,1,999,"read")))
+# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
+# define __must_hold(x)       __attribute__((context(x,1,1), require_context(x,1,999,"call")))
+#else
+# define __protected_by(x)
+# define __protected_read_by(x)
+# define __protected_write_by(x)
+# define __must_hold(x)
+#endif
+
+#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
+
+/* module parameter, defined in drbd_main.c */
+extern unsigned int minor_count;
+extern int disable_sendpage;
+extern int allow_oos;
+extern unsigned int cn_idx;
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern int enable_faults;
+extern int fault_rate;
+extern int fault_devs;
+#endif
+
+extern char usermode_helper[];
+
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/* I don't remember why XCPU ...
+ * This is used to wake the asender,
+ * and to interrupt sending the sending task
+ * on disconnect.
+ */
+#define DRBD_SIG SIGXCPU
+
+/* This is used to stop/restart our threads.
+ * Cannot use SIGTERM nor SIGKILL, since these
+ * are sent out by init on runlevel changes
+ * I choose SIGHUP for now.
+ */
+#define DRBD_SIGKILL SIGHUP
+
+/* All EEs on the free list should have ID_VACANT (== 0)
+ * freshly allocated EEs get !ID_VACANT (== 1)
+ * so if it says "cannot dereference null pointer at adress 0x00000001",
+ * it is most likely one of these :( */
+
+#define ID_IN_SYNC      (4711ULL)
+#define ID_OUT_OF_SYNC  (4712ULL)
+
+#define ID_SYNCER (-1ULL)
+#define ID_VACANT 0
+#define is_syncer_block_id(id) ((id) == ID_SYNCER)
+
+struct drbd_conf;
+
+
+/* to shorten dev_warn(DEV, "msg"); and relatives statements */
+#define DEV (disk_to_dev(mdev->vdisk))
+
+#define D_ASSERT(exp)	if (!(exp)) \
+	 dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
+
+#define ERR_IF(exp) if (({				\
+	int _b = (exp) != 0;				\
+	if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n",	\
+		__func__, #exp, __FILE__, __LINE__);	\
+	 _b;						\
+	}))
+
+/* Defines to control fault insertion */
+enum {
+	DRBD_FAULT_MD_WR = 0,	/* meta data write */
+	DRBD_FAULT_MD_RD = 1,	/*           read  */
+	DRBD_FAULT_RS_WR = 2,	/* resync          */
+	DRBD_FAULT_RS_RD = 3,
+	DRBD_FAULT_DT_WR = 4,	/* data            */
+	DRBD_FAULT_DT_RD = 5,
+	DRBD_FAULT_DT_RA = 6,	/* data read ahead */
+	DRBD_FAULT_BM_ALLOC = 7,	/* bitmap allocation */
+	DRBD_FAULT_AL_EE = 8,	/* alloc ee */
+
+	DRBD_FAULT_MAX,
+};
+
+extern void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern unsigned int
+_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
+static inline int
+drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
+	return fault_rate &&
+		(enable_faults & (1<<type)) &&
+		_drbd_insert_fault(mdev, type);
+}
+#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
+
+#else
+#define FAULT_ACTIVE(_m, _t) (0)
+#endif
+
+/* integer division, round _UP_ to the next integer */
+#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
+/* usual integer division */
+#define div_floor(A, B) ((A)/(B))
+
+/* drbd_meta-data.c (still in drbd_main.c) */
+/* 4th incarnation of the disk layout. */
+#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
+
+extern struct drbd_conf **minor_table;
+extern struct ratelimit_state drbd_ratelimit_state;
+
+/* on the wire */
+enum drbd_packets {
+	/* receiver (data socket) */
+	P_DATA		      = 0x00,
+	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
+	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
+	P_BARRIER	      = 0x03,
+	P_BITMAP	      = 0x04,
+	P_BECOME_SYNC_TARGET  = 0x05,
+	P_BECOME_SYNC_SOURCE  = 0x06,
+	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
+	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
+	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
+	P_SYNC_PARAM	      = 0x0a,
+	P_PROTOCOL	      = 0x0b,
+	P_UUIDS		      = 0x0c,
+	P_SIZES		      = 0x0d,
+	P_STATE		      = 0x0e,
+	P_SYNC_UUID	      = 0x0f,
+	P_AUTH_CHALLENGE      = 0x10,
+	P_AUTH_RESPONSE	      = 0x11,
+	P_STATE_CHG_REQ	      = 0x12,
+
+	/* asender (meta socket */
+	P_PING		      = 0x13,
+	P_PING_ACK	      = 0x14,
+	P_RECV_ACK	      = 0x15, /* Used in protocol B */
+	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
+	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
+	P_DISCARD_ACK	      = 0x18, /* Used in proto C, two-primaries conflict detection */
+	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
+	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
+	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
+	P_BARRIER_ACK	      = 0x1c,
+	P_STATE_CHG_REPLY     = 0x1d,
+
+	/* "new" commands, no longer fitting into the ordering scheme above */
+
+	P_OV_REQUEST	      = 0x1e, /* data socket */
+	P_OV_REPLY	      = 0x1f,
+	P_OV_RESULT	      = 0x20, /* meta socket */
+	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
+	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
+	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
+	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */
+
+	P_MAX_CMD	      = 0x25,
+	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
+	P_MAX_OPT_CMD	      = 0x101,
+
+	/* special command ids for handshake */
+
+	P_HAND_SHAKE_M	      = 0xfff1, /* First Packet on the MetaSock */
+	P_HAND_SHAKE_S	      = 0xfff2, /* First Packet on the Socket */
+
+	P_HAND_SHAKE	      = 0xfffe	/* FIXED for the next century! */
+};
+
+static inline const char *cmdname(enum drbd_packets cmd)
+{
+	/* THINK may need to become several global tables
+	 * when we want to support more than
+	 * one PRO_VERSION */
+	static const char *cmdnames[] = {
+		[P_DATA]	        = "Data",
+		[P_DATA_REPLY]	        = "DataReply",
+		[P_RS_DATA_REPLY]	= "RSDataReply",
+		[P_BARRIER]	        = "Barrier",
+		[P_BITMAP]	        = "ReportBitMap",
+		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
+		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
+		[P_UNPLUG_REMOTE]	= "UnplugRemote",
+		[P_DATA_REQUEST]	= "DataRequest",
+		[P_RS_DATA_REQUEST]     = "RSDataRequest",
+		[P_SYNC_PARAM]	        = "SyncParam",
+		[P_SYNC_PARAM89]	= "SyncParam89",
+		[P_PROTOCOL]            = "ReportProtocol",
+		[P_UUIDS]	        = "ReportUUIDs",
+		[P_SIZES]	        = "ReportSizes",
+		[P_STATE]	        = "ReportState",
+		[P_SYNC_UUID]           = "ReportSyncUUID",
+		[P_AUTH_CHALLENGE]      = "AuthChallenge",
+		[P_AUTH_RESPONSE]	= "AuthResponse",
+		[P_PING]		= "Ping",
+		[P_PING_ACK]	        = "PingAck",
+		[P_RECV_ACK]	        = "RecvAck",
+		[P_WRITE_ACK]	        = "WriteAck",
+		[P_RS_WRITE_ACK]	= "RSWriteAck",
+		[P_DISCARD_ACK]	        = "DiscardAck",
+		[P_NEG_ACK]	        = "NegAck",
+		[P_NEG_DREPLY]	        = "NegDReply",
+		[P_NEG_RS_DREPLY]	= "NegRSDReply",
+		[P_BARRIER_ACK]	        = "BarrierAck",
+		[P_STATE_CHG_REQ]       = "StateChgRequest",
+		[P_STATE_CHG_REPLY]     = "StateChgReply",
+		[P_OV_REQUEST]          = "OVRequest",
+		[P_OV_REPLY]            = "OVReply",
+		[P_OV_RESULT]           = "OVResult",
+		[P_MAX_CMD]	        = NULL,
+	};
+
+	if (cmd == P_HAND_SHAKE_M)
+		return "HandShakeM";
+	if (cmd == P_HAND_SHAKE_S)
+		return "HandShakeS";
+	if (cmd == P_HAND_SHAKE)
+		return "HandShake";
+	if (cmd >= P_MAX_CMD)
+		return "Unknown";
+	return cmdnames[cmd];
+}
+
+/* for sending/receiving the bitmap,
+ * possibly in some encoding scheme */
+struct bm_xfer_ctx {
+	/* "const"
+	 * stores total bits and long words
+	 * of the bitmap, so we don't need to
+	 * call the accessor functions over and again. */
+	unsigned long bm_bits;
+	unsigned long bm_words;
+	/* during xfer, current position within the bitmap */
+	unsigned long bit_offset;
+	unsigned long word_offset;
+
+	/* statistics; index: (h->command == P_BITMAP) */
+	unsigned packets[2];
+	unsigned bytes[2];
+};
+
+extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+		const char *direction, struct bm_xfer_ctx *c);
+
+static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
+{
+	/* word_offset counts "native long words" (32 or 64 bit),
+	 * aligned at 64 bit.
+	 * Encoded packet may end at an unaligned bit offset.
+	 * In case a fallback clear text packet is transmitted in
+	 * between, we adjust this offset back to the last 64bit
+	 * aligned "native long word", which makes coding and decoding
+	 * the plain text bitmap much more convenient.  */
+#if BITS_PER_LONG == 64
+	c->word_offset = c->bit_offset >> 6;
+#elif BITS_PER_LONG == 32
+	c->word_offset = c->bit_offset >> 5;
+	c->word_offset &= ~(1UL);
+#else
+# error "unsupported BITS_PER_LONG"
+#endif
+}
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+/* This is the layout for a packet on the wire.
+ * The byteorder is the network byte order.
+ *     (except block_id and barrier fields.
+ *	these are pointers to local structs
+ *	and have no relevance for the partner,
+ *	which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
+ */
+struct p_header {
+	u32	  magic;
+	u16	  command;
+	u16	  length;	/* bytes of data after this header */
+	u8	  payload[0];
+} __packed;
+/* 8 bytes. packet FIXED for the next century! */
+
+/*
+ * short commands, packets without payload, plain p_header:
+ *   P_PING
+ *   P_PING_ACK
+ *   P_BECOME_SYNC_TARGET
+ *   P_BECOME_SYNC_SOURCE
+ *   P_UNPLUG_REMOTE
+ */
+
+/*
+ * commands with out-of-struct payload:
+ *   P_BITMAP    (no additional fields)
+ *   P_DATA, P_DATA_REPLY (see p_data)
+ *   P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
+ */
+
+/* these defines must not be changed without changing the protocol version */
+#define DP_HARDBARRIER	      1
+#define DP_RW_SYNC	      2
+#define DP_MAY_SET_IN_SYNC    4
+
+struct p_data {
+	struct p_header head;
+	u64	    sector;    /* 64 bits sector number */
+	u64	    block_id;  /* to identify the request in protocol B&C */
+	u32	    seq_num;
+	u32	    dp_flags;
+} __packed;
+
+/*
+ * commands which share a struct:
+ *  p_block_ack:
+ *   P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
+ *   P_DISCARD_ACK (proto C, two-primaries conflict detection)
+ *  p_block_req:
+ *   P_DATA_REQUEST, P_RS_DATA_REQUEST
+ */
+struct p_block_ack {
+	struct p_header head;
+	u64	    sector;
+	u64	    block_id;
+	u32	    blksize;
+	u32	    seq_num;
+} __packed;
+
+
+struct p_block_req {
+	struct p_header head;
+	u64 sector;
+	u64 block_id;
+	u32 blksize;
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+/*
+ * commands with their own struct for additional fields:
+ *   P_HAND_SHAKE
+ *   P_BARRIER
+ *   P_BARRIER_ACK
+ *   P_SYNC_PARAM
+ *   ReportParams
+ */
+
+struct p_handshake {
+	struct p_header head;	/* 8 bytes */
+	u32 protocol_min;
+	u32 feature_flags;
+	u32 protocol_max;
+
+	/* should be more than enough for future enhancements
+	 * for now, feature_flags and the reserverd array shall be zero.
+	 */
+
+	u32 _pad;
+	u64 reserverd[7];
+} __packed;
+/* 80 bytes, FIXED for the next century */
+
+struct p_barrier {
+	struct p_header head;
+	u32 barrier;	/* barrier number _handle_ only */
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+struct p_barrier_ack {
+	struct p_header head;
+	u32 barrier;
+	u32 set_size;
+} __packed;
+
+struct p_rs_param {
+	struct p_header head;
+	u32 rate;
+
+	      /* Since protocol version 88 and higher. */
+	char verify_alg[0];
+} __packed;
+
+struct p_rs_param_89 {
+	struct p_header head;
+	u32 rate;
+        /* protocol version 89: */
+	char verify_alg[SHARED_SECRET_MAX];
+	char csums_alg[SHARED_SECRET_MAX];
+} __packed;
+
+struct p_protocol {
+	struct p_header head;
+	u32 protocol;
+	u32 after_sb_0p;
+	u32 after_sb_1p;
+	u32 after_sb_2p;
+	u32 want_lose;
+	u32 two_primaries;
+
+              /* Since protocol version 87 and higher. */
+	char integrity_alg[0];
+
+} __packed;
+
+struct p_uuids {
+	struct p_header head;
+	u64 uuid[UI_EXTENDED_SIZE];
+} __packed;
+
+struct p_rs_uuid {
+	struct p_header head;
+	u64	    uuid;
+} __packed;
+
+struct p_sizes {
+	struct p_header head;
+	u64	    d_size;  /* size of disk */
+	u64	    u_size;  /* user requested size */
+	u64	    c_size;  /* current exported size */
+	u32	    max_segment_size;  /* Maximal size of a BIO */
+	u32	    queue_order_type;
+} __packed;
+
+struct p_state {
+	struct p_header head;
+	u32	    state;
+} __packed;
+
+struct p_req_state {
+	struct p_header head;
+	u32	    mask;
+	u32	    val;
+} __packed;
+
+struct p_req_state_reply {
+	struct p_header head;
+	u32	    retcode;
+} __packed;
+
+struct p_drbd06_param {
+	u64	  size;
+	u32	  state;
+	u32	  blksize;
+	u32	  protocol;
+	u32	  version;
+	u32	  gen_cnt[5];
+	u32	  bit_map_gen[5];
+} __packed;
+
+struct p_discard {
+	struct p_header head;
+	u64	    block_id;
+	u32	    seq_num;
+	u32	    pad;
+} __packed;
+
+/* Valid values for the encoding field.
+ * Bump proto version when changing this. */
+enum drbd_bitmap_code {
+	/* RLE_VLI_Bytes = 0,
+	 * and other bit variants had been defined during
+	 * algorithm evaluation. */
+	RLE_VLI_Bits = 2,
+};
+
+struct p_compressed_bm {
+	struct p_header head;
+	/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
+	 * (encoding & 0x80): polarity (set/unset) of first runlength
+	 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
+	 * used to pad up to head.length bytes
+	 */
+	u8 encoding;
+
+	u8 code[0];
+} __packed;
+
+/* DCBP: Drbd Compressed Bitmap Packet ... */
+static inline enum drbd_bitmap_code
+DCBP_get_code(struct p_compressed_bm *p)
+{
+	return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static inline void
+DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
+{
+	BUG_ON(code & ~0xf);
+	p->encoding = (p->encoding & ~0xf) | code;
+}
+
+static inline int
+DCBP_get_start(struct p_compressed_bm *p)
+{
+	return (p->encoding & 0x80) != 0;
+}
+
+static inline void
+DCBP_set_start(struct p_compressed_bm *p, int set)
+{
+	p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
+
+static inline int
+DCBP_get_pad_bits(struct p_compressed_bm *p)
+{
+	return (p->encoding >> 4) & 0x7;
+}
+
+static inline void
+DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+	BUG_ON(n & ~0x7);
+	p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
+}
+
+/* one bitmap packet, including the p_header,
+ * should fit within one _architecture independend_ page.
+ * so we need to use the fixed size 4KiB page size
+ * most architechtures have used for a long time.
+ */
+#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
+#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
+#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
+#if (PAGE_SIZE < 4096)
+/* drbd_send_bitmap / receive_bitmap would break horribly */
+#error "PAGE_SIZE too small"
+#endif
+
+union p_polymorph {
+        struct p_header          header;
+        struct p_handshake       handshake;
+        struct p_data            data;
+        struct p_block_ack       block_ack;
+        struct p_barrier         barrier;
+        struct p_barrier_ack     barrier_ack;
+        struct p_rs_param_89     rs_param_89;
+        struct p_protocol        protocol;
+        struct p_sizes           sizes;
+        struct p_uuids           uuids;
+        struct p_state           state;
+        struct p_req_state       req_state;
+        struct p_req_state_reply req_state_reply;
+        struct p_block_req       block_req;
+} __packed;
+
+/**********************************************************************/
+enum drbd_thread_state {
+	None,
+	Running,
+	Exiting,
+	Restarting
+};
+
+struct drbd_thread {
+	spinlock_t t_lock;
+	struct task_struct *task;
+	struct completion stop;
+	enum drbd_thread_state t_state;
+	int (*function) (struct drbd_thread *);
+	struct drbd_conf *mdev;
+	int reset_cpu_mask;
+};
+
+static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
+{
+	/* THINK testing the t_state seems to be uncritical in all cases
+	 * (but thread_{start,stop}), so we can read it *without* the lock.
+	 *	--lge */
+
+	smp_rmb();
+	return thi->t_state;
+}
+
+
+/*
+ * Having this as the first member of a struct provides sort of "inheritance".
+ * "derived" structs can be "drbd_queue_work()"ed.
+ * The callback should know and cast back to the descendant struct.
+ * drbd_request and drbd_epoch_entry are descendants of drbd_work.
+ */
+struct drbd_work;
+typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
+struct drbd_work {
+	struct list_head list;
+	drbd_work_cb cb;
+};
+
+struct drbd_tl_epoch;
+struct drbd_request {
+	struct drbd_work w;
+	struct drbd_conf *mdev;
+
+	/* if local IO is not allowed, will be NULL.
+	 * if local IO _is_ allowed, holds the locally submitted bio clone,
+	 * or, after local IO completion, the ERR_PTR(error).
+	 * see drbd_endio_pri(). */
+	struct bio *private_bio;
+
+	struct hlist_node colision;
+	sector_t sector;
+	unsigned int size;
+	unsigned int epoch; /* barrier_nr */
+
+	/* barrier_nr: used to check on "completion" whether this req was in
+	 * the current epoch, and we therefore have to close it,
+	 * starting a new epoch...
+	 */
+
+	/* up to here, the struct layout is identical to drbd_epoch_entry;
+	 * we might be able to use that to our advantage...  */
+
+	struct list_head tl_requests; /* ring list in the transfer log */
+	struct bio *master_bio;       /* master bio pointer */
+	unsigned long rq_state; /* see comments above _req_mod() */
+	int seq_num;
+	unsigned long start_time;
+};
+
+struct drbd_tl_epoch {
+	struct drbd_work w;
+	struct list_head requests; /* requests before */
+	struct drbd_tl_epoch *next; /* pointer to the next barrier */
+	unsigned int br_number;  /* the barriers identifier. */
+	int n_req;	/* number of requests attached before this barrier */
+};
+
+struct drbd_request;
+
+/* These Tl_epoch_entries may be in one of 6 lists:
+   active_ee .. data packet being written
+   sync_ee   .. syncer block being written
+   done_ee   .. block written, need to send P_WRITE_ACK
+   read_ee   .. [RS]P_DATA_REQUEST being read
+*/
+
+struct drbd_epoch {
+	struct list_head list;
+	unsigned int barrier_nr;
+	atomic_t epoch_size; /* increased on every request added. */
+	atomic_t active;     /* increased on every req. added, and dec on every finished. */
+	unsigned long flags;
+};
+
+/* drbd_epoch flag bits */
+enum {
+	DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
+	DE_BARRIER_IN_NEXT_EPOCH_DONE,
+	DE_CONTAINS_A_BARRIER,
+	DE_HAVE_BARRIER_NUMBER,
+	DE_IS_FINISHING,
+};
+
+enum epoch_event {
+	EV_PUT,
+	EV_GOT_BARRIER_NR,
+	EV_BARRIER_DONE,
+	EV_BECAME_LAST,
+	EV_TRACE_FLUSH,       /* TRACE_ are not real events, only used for tracing */
+	EV_TRACE_ADD_BARRIER, /* Doing the first write as a barrier write */
+	EV_TRACE_SETTING_BI,  /* Barrier is expressed with the first write of the next epoch */
+	EV_TRACE_ALLOC,
+	EV_TRACE_FREE,
+	EV_CLEANUP = 32, /* used as flag */
+};
+
+struct drbd_epoch_entry {
+	struct drbd_work    w;
+	struct drbd_conf *mdev;
+	struct bio *private_bio;
+	struct hlist_node colision;
+	sector_t sector;
+	unsigned int size;
+	struct drbd_epoch *epoch;
+
+	/* up to here, the struct layout is identical to drbd_request;
+	 * we might be able to use that to our advantage...  */
+
+	unsigned int flags;
+	u64    block_id;
+};
+
+struct drbd_wq_barrier {
+	struct drbd_work w;
+	struct completion done;
+};
+
+struct digest_info {
+	int digest_size;
+	void *digest;
+};
+
+/* ee flag bits */
+enum {
+	__EE_CALL_AL_COMPLETE_IO,
+	__EE_CONFLICT_PENDING,
+	__EE_MAY_SET_IN_SYNC,
+	__EE_IS_BARRIER,
+};
+#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
+#define EE_CONFLICT_PENDING    (1<<__EE_CONFLICT_PENDING)
+#define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
+
+/* global flag bits */
+enum {
+	CREATE_BARRIER,		/* next P_DATA is preceeded by a P_BARRIER */
+	SIGNAL_ASENDER,		/* whether asender wants to be interrupted */
+	SEND_PING,		/* whether asender should send a ping asap */
+
+	STOP_SYNC_TIMER,	/* tell timer to cancel itself */
+	UNPLUG_QUEUED,		/* only relevant with kernel 2.4 */
+	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
+	MD_DIRTY,		/* current uuids and flags not yet on disk */
+	DISCARD_CONCURRENT,	/* Set on one node, cleared on the peer! */
+	USE_DEGR_WFC_T,		/* degr-wfc-timeout instead of wfc-timeout. */
+	CLUSTER_ST_CHANGE,	/* Cluster wide state change going on... */
+	CL_ST_CHG_SUCCESS,
+	CL_ST_CHG_FAIL,
+	CRASHED_PRIMARY,	/* This node was a crashed primary.
+				 * Gets cleared when the state.conn
+				 * goes into C_CONNECTED state. */
+	WRITE_BM_AFTER_RESYNC,	/* A kmalloc() during resync failed */
+	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
+	CONSIDER_RESYNC,
+
+	MD_NO_BARRIER,		/* meta data device does not support barriers,
+				   so don't even try */
+	SUSPEND_IO,		/* suspend application io */
+	BITMAP_IO,		/* suspend application io;
+				   once no more io in flight, start bitmap io */
+	BITMAP_IO_QUEUED,       /* Started bitmap IO */
+	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
+	NET_CONGESTED,		/* The data socket is congested */
+
+	CONFIG_PENDING,		/* serialization of (re)configuration requests.
+				 * if set, also prevents the device from dying */
+	DEVICE_DYING,		/* device became unconfigured,
+				 * but worker thread is still handling the cleanup.
+				 * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed,
+				 * while this is set. */
+	RESIZE_PENDING,		/* Size change detected locally, waiting for the response from
+				 * the peer, if it changed there as well. */
+};
+
+struct drbd_bitmap; /* opaque for drbd_conf */
+
+/* TODO sort members for performance
+ * MAYBE group them further */
+
+/* THINK maybe we actually want to use the default "event/%s" worker threads
+ * or similar in linux 2.6, which uses per cpu data and threads.
+ *
+ * To be general, this might need a spin_lock member.
+ * For now, please use the mdev->req_lock to protect list_head,
+ * see drbd_queue_work below.
+ */
+struct drbd_work_queue {
+	struct list_head q;
+	struct semaphore s; /* producers up it, worker down()s it */
+	spinlock_t q_lock;  /* to protect the list. */
+};
+
+struct drbd_socket {
+	struct drbd_work_queue work;
+	struct mutex mutex;
+	struct socket    *socket;
+	/* this way we get our
+	 * send/receive buffers off the stack */
+	union p_polymorph sbuf;
+	union p_polymorph rbuf;
+};
+
+struct drbd_md {
+	u64 md_offset;		/* sector offset to 'super' block */
+
+	u64 la_size_sect;	/* last agreed size, unit sectors */
+	u64 uuid[UI_SIZE];
+	u64 device_uuid;
+	u32 flags;
+	u32 md_size_sect;
+
+	s32 al_offset;	/* signed relative sector offset to al area */
+	s32 bm_offset;	/* signed relative sector offset to bitmap */
+
+	/* u32 al_nr_extents;	   important for restoring the AL
+	 * is stored into  sync_conf.al_extents, which in turn
+	 * gets applied to act_log->nr_elements
+	 */
+};
+
+/* for sync_conf and other types... */
+#define NL_PACKET(name, number, fields) struct name { fields };
+#define NL_INTEGER(pn,pr,member) int member;
+#define NL_INT64(pn,pr,member) __u64 member;
+#define NL_BIT(pn,pr,member)   unsigned member:1;
+#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
+#include "linux/drbd_nl.h"
+
+struct drbd_backing_dev {
+	struct block_device *backing_bdev;
+	struct block_device *md_bdev;
+	struct file *lo_file;
+	struct file *md_file;
+	struct drbd_md md;
+	struct disk_conf dc; /* The user provided config... */
+	sector_t known_size; /* last known size of that backing device */
+};
+
+struct drbd_md_io {
+	struct drbd_conf *mdev;
+	struct completion event;
+	int error;
+};
+
+struct bm_io_work {
+	struct drbd_work w;
+	char *why;
+	int (*io_fn)(struct drbd_conf *mdev);
+	void (*done)(struct drbd_conf *mdev, int rv);
+};
+
+enum write_ordering_e {
+	WO_none,
+	WO_drain_io,
+	WO_bdev_flush,
+	WO_bio_barrier
+};
+
+struct drbd_conf {
+	/* things that are stored as / read from meta data on disk */
+	unsigned long flags;
+
+	/* configured by drbdsetup */
+	struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
+	struct syncer_conf sync_conf;
+	struct drbd_backing_dev *ldev __protected_by(local);
+
+	sector_t p_size;     /* partner's disk size */
+	struct request_queue *rq_queue;
+	struct block_device *this_bdev;
+	struct gendisk	    *vdisk;
+
+	struct drbd_socket data; /* data/barrier/cstate/parameter packets */
+	struct drbd_socket meta; /* ping/ack (metadata) packets */
+	int agreed_pro_version;  /* actually used protocol version */
+	unsigned long last_received; /* in jiffies, either socket */
+	unsigned int ko_count;
+	struct drbd_work  resync_work,
+			  unplug_work,
+			  md_sync_work;
+	struct timer_list resync_timer;
+	struct timer_list md_sync_timer;
+
+	/* Used after attach while negotiating new disk state. */
+	union drbd_state new_state_tmp;
+
+	union drbd_state state;
+	wait_queue_head_t misc_wait;
+	wait_queue_head_t state_wait;  /* upon each state change. */
+	unsigned int send_cnt;
+	unsigned int recv_cnt;
+	unsigned int read_cnt;
+	unsigned int writ_cnt;
+	unsigned int al_writ_cnt;
+	unsigned int bm_writ_cnt;
+	atomic_t ap_bio_cnt;	 /* Requests we need to complete */
+	atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
+	atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
+	atomic_t unacked_cnt;	 /* Need to send replys for */
+	atomic_t local_cnt;	 /* Waiting for local completion */
+	atomic_t net_cnt;	 /* Users of net_conf */
+	spinlock_t req_lock;
+	struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
+	struct drbd_tl_epoch *newest_tle;
+	struct drbd_tl_epoch *oldest_tle;
+	struct list_head out_of_sequence_requests;
+	struct hlist_head *tl_hash;
+	unsigned int tl_hash_s;
+
+	/* blocks to sync in this run [unit BM_BLOCK_SIZE] */
+	unsigned long rs_total;
+	/* number of sync IOs that failed in this run */
+	unsigned long rs_failed;
+	/* Syncer's start time [unit jiffies] */
+	unsigned long rs_start;
+	/* cumulated time in PausedSyncX state [unit jiffies] */
+	unsigned long rs_paused;
+	/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
+	unsigned long rs_mark_left;
+	/* marks's time [unit jiffies] */
+	unsigned long rs_mark_time;
+	/* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
+	unsigned long rs_same_csum;
+
+	/* where does the admin want us to start? (sector) */
+	sector_t ov_start_sector;
+	/* where are we now? (sector) */
+	sector_t ov_position;
+	/* Start sector of out of sync range (to merge printk reporting). */
+	sector_t ov_last_oos_start;
+	/* size of out-of-sync range in sectors. */
+	sector_t ov_last_oos_size;
+	unsigned long ov_left; /* in bits */
+	struct crypto_hash *csums_tfm;
+	struct crypto_hash *verify_tfm;
+
+	struct drbd_thread receiver;
+	struct drbd_thread worker;
+	struct drbd_thread asender;
+	struct drbd_bitmap *bitmap;
+	unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
+
+	/* Used to track operations of resync... */
+	struct lru_cache *resync;
+	/* Number of locked elements in resync LRU */
+	unsigned int resync_locked;
+	/* resync extent number waiting for application requests */
+	unsigned int resync_wenr;
+
+	int open_cnt;
+	u64 *p_uuid;
+	struct drbd_epoch *current_epoch;
+	spinlock_t epoch_lock;
+	unsigned int epochs;
+	enum write_ordering_e write_ordering;
+	struct list_head active_ee; /* IO in progress */
+	struct list_head sync_ee;   /* IO in progress */
+	struct list_head done_ee;   /* send ack */
+	struct list_head read_ee;   /* IO in progress */
+	struct list_head net_ee;    /* zero-copy network send in progress */
+	struct hlist_head *ee_hash; /* is proteced by req_lock! */
+	unsigned int ee_hash_s;
+
+	/* this one is protected by ee_lock, single thread */
+	struct drbd_epoch_entry *last_write_w_barrier;
+
+	int next_barrier_nr;
+	struct hlist_head *app_reads_hash; /* is proteced by req_lock */
+	struct list_head resync_reads;
+	atomic_t pp_in_use;
+	wait_queue_head_t ee_wait;
+	struct page *md_io_page;	/* one page buffer for md_io */
+	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
+	struct mutex md_io_mutex;	/* protects the md_io_buffer */
+	spinlock_t al_lock;
+	wait_queue_head_t al_wait;
+	struct lru_cache *act_log;	/* activity log */
+	unsigned int al_tr_number;
+	int al_tr_cycle;
+	int al_tr_pos;   /* position of the next transaction in the journal */
+	struct crypto_hash *cram_hmac_tfm;
+	struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
+	struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
+	void *int_dig_out;
+	void *int_dig_in;
+	void *int_dig_vv;
+	wait_queue_head_t seq_wait;
+	atomic_t packet_seq;
+	unsigned int peer_seq;
+	spinlock_t peer_seq_lock;
+	unsigned int minor;
+	unsigned long comm_bm_set; /* communicated number of set bits. */
+	cpumask_var_t cpu_mask;
+	struct bm_io_work bm_io_work;
+	u64 ed_uuid; /* UUID of the exposed data */
+	struct mutex state_mutex;
+	char congestion_reason;  /* Why we where congested... */
+};
+
+static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
+{
+	struct drbd_conf *mdev;
+
+	mdev = minor < minor_count ? minor_table[minor] : NULL;
+
+	return mdev;
+}
+
+static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
+{
+	return mdev->minor;
+}
+
+/* returns 1 if it was successfull,
+ * returns 0 if there was no data socket.
+ * so wherever you are going to use the data.socket, e.g. do
+ * if (!drbd_get_data_sock(mdev))
+ *	return 0;
+ *	CODE();
+ * drbd_put_data_sock(mdev);
+ */
+static inline int drbd_get_data_sock(struct drbd_conf *mdev)
+{
+	mutex_lock(&mdev->data.mutex);
+	/* drbd_disconnect() could have called drbd_free_sock()
+	 * while we were waiting in down()... */
+	if (unlikely(mdev->data.socket == NULL)) {
+		mutex_unlock(&mdev->data.mutex);
+		return 0;
+	}
+	return 1;
+}
+
+static inline void drbd_put_data_sock(struct drbd_conf *mdev)
+{
+	mutex_unlock(&mdev->data.mutex);
+}
+
+/*
+ * function declarations
+ *************************/
+
+/* drbd_main.c */
+
+enum chg_state_flags {
+	CS_HARD	= 1,
+	CS_VERBOSE = 2,
+	CS_WAIT_COMPLETE = 4,
+	CS_SERIALIZE    = 8,
+	CS_ORDERED      = CS_WAIT_COMPLETE + CS_SERIALIZE,
+};
+
+extern void drbd_init_set_defaults(struct drbd_conf *mdev);
+extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
+			union drbd_state mask, union drbd_state val);
+extern void drbd_force_state(struct drbd_conf *, union drbd_state,
+			union drbd_state);
+extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
+			union drbd_state, enum chg_state_flags);
+extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
+			    enum chg_state_flags, struct completion *done);
+extern void print_st_err(struct drbd_conf *, union drbd_state,
+			union drbd_state, int);
+extern int  drbd_thread_start(struct drbd_thread *thi);
+extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+#ifdef CONFIG_SMP
+extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
+extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
+#else
+#define drbd_thread_current_set_cpu(A) ({})
+#define drbd_calc_cpu_mask(A) ({})
+#endif
+extern void drbd_free_resources(struct drbd_conf *mdev);
+extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+		       unsigned int set_size);
+extern void tl_clear(struct drbd_conf *mdev);
+extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
+extern void drbd_free_sock(struct drbd_conf *mdev);
+extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+			void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_protocol(struct drbd_conf *mdev);
+extern int drbd_send_uuids(struct drbd_conf *mdev);
+extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
+extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
+extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
+extern int _drbd_send_state(struct drbd_conf *mdev);
+extern int drbd_send_state(struct drbd_conf *mdev);
+extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
+			enum drbd_packets cmd, struct p_header *h,
+			size_t size, unsigned msg_flags);
+#define USE_DATA_SOCKET 1
+#define USE_META_SOCKET 0
+extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
+			enum drbd_packets cmd, struct p_header *h,
+			size_t size);
+extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
+			char *data, size_t size);
+extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
+extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
+			u32 set_size);
+extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
+			struct drbd_epoch_entry *e);
+extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
+			struct p_block_req *rp);
+extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
+			struct p_data *dp);
+extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+			    sector_t sector, int blksize, u64 block_id);
+extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
+			   struct drbd_epoch_entry *e);
+extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
+extern int _drbd_send_barrier(struct drbd_conf *mdev,
+			struct drbd_tl_epoch *barrier);
+extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+			      sector_t sector, int size, u64 block_id);
+extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
+				   sector_t sector,int size,
+				   void *digest, int digest_size,
+				   enum drbd_packets cmd);
+extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
+
+extern int drbd_send_bitmap(struct drbd_conf *mdev);
+extern int _drbd_send_bitmap(struct drbd_conf *mdev);
+extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
+extern void drbd_free_bc(struct drbd_backing_dev *ldev);
+extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
+
+/* drbd_meta-data.c (still in drbd_main.c) */
+extern void drbd_md_sync(struct drbd_conf *mdev);
+extern int  drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
+/* maybe define them below as inline? */
+extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
+extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
+extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
+				 int (*io_fn)(struct drbd_conf *),
+				 void (*done)(struct drbd_conf *, int),
+				 char *why);
+extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
+extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
+extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
+
+
+/* Meta data layout
+   We reserve a 128MB Block (4k aligned)
+   * either at the end of the backing device
+   * or on a seperate meta data device. */
+
+#define MD_RESERVED_SECT (128LU << 11)  /* 128 MB, unit sectors */
+/* The following numbers are sectors */
+#define MD_AL_OFFSET 8	    /* 8 Sectors after start of meta area */
+#define MD_AL_MAX_SIZE 64   /* = 32 kb LOG  ~ 3776 extents ~ 14 GB Storage */
+/* Allows up to about 3.8TB */
+#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
+
+/* Since the smalles IO unit is usually 512 byte */
+#define MD_SECTOR_SHIFT	 9
+#define MD_SECTOR_SIZE	 (1<<MD_SECTOR_SHIFT)
+
+/* activity log */
+#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
+#define AL_EXTENT_SHIFT 22		 /* One extent represents 4M Storage */
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+#if BITS_PER_LONG == 32
+#define LN2_BPL 5
+#define cpu_to_lel(A) cpu_to_le32(A)
+#define lel_to_cpu(A) le32_to_cpu(A)
+#elif BITS_PER_LONG == 64
+#define LN2_BPL 6
+#define cpu_to_lel(A) cpu_to_le64(A)
+#define lel_to_cpu(A) le64_to_cpu(A)
+#else
+#error "LN2 of BITS_PER_LONG unknown!"
+#endif
+
+/* resync bitmap */
+/* 16MB sized 'bitmap extent' to track syncer usage */
+struct bm_extent {
+	int rs_left; /* number of bits set (out of sync) in this extent. */
+	int rs_failed; /* number of failed resync requests in this extent. */
+	unsigned long flags;
+	struct lc_element lce;
+};
+
+#define BME_NO_WRITES  0  /* bm_extent.flags: no more requests on this one! */
+#define BME_LOCKED     1  /* bm_extent.flags: syncer active on this one. */
+
+/* drbd_bitmap.c */
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define BM_BLOCK_SHIFT  12			 /* 4k per bit */
+#define BM_BLOCK_SIZE	 (1<<BM_BLOCK_SHIFT)
+/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
+ * per sector of on disk bitmap */
+#define BM_EXT_SHIFT	 (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3)  /* = 24 */
+#define BM_EXT_SIZE	 (1<<BM_EXT_SHIFT)
+
+#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
+#error "HAVE YOU FIXED drbdmeta AS WELL??"
+#endif
+
+/* thus many _storage_ sectors are described by one bit */
+#define BM_SECT_TO_BIT(x)   ((x)>>(BM_BLOCK_SHIFT-9))
+#define BM_BIT_TO_SECT(x)   ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
+#define BM_SECT_PER_BIT     BM_BIT_TO_SECT(1)
+
+/* bit to represented kilo byte conversion */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located in */
+#define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
+
+/* how much _storage_ sectors we have per bitmap sector */
+#define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
+#define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
+
+/* in one sector of the bitmap, we have this many activity_log extents. */
+#define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
+#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+
+#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
+#define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
+
+/* the extent in "PER_EXTENT" below is an activity log extent
+ * we need that many (long words/bytes) to store the bitmap
+ *		     of one AL_EXTENT_SIZE chunk of storage.
+ * we can store the bitmap for that many AL_EXTENTS within
+ * one sector of the _on_disk_ bitmap:
+ * bit	 0	  bit 37   bit 38	     bit (512*8)-1
+ *	     ...|........|........|.. // ..|........|
+ * sect. 0	 `296	  `304			   ^(512*8*8)-1
+ *
+#define BM_WORDS_PER_EXT    ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
+#define BM_BYTES_PER_EXT    ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 )  // 128
+#define BM_EXT_PER_SECT	    ( 512 / BM_BYTES_PER_EXTENT )	 //   4
+ */
+
+#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
+#define DRBD_MAX_SECTORS_BM \
+	  ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
+#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_BM
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
+#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
+#else
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_BM
+/* 16 TB in units of sectors */
+#if BITS_PER_LONG == 32
+/* adjust by one page worth of bitmap,
+ * so we won't wrap around in drbd_bm_find_next_bit.
+ * you should use 64bit OS for that much storage, anyways. */
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
+#else
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32)
+#endif
+#endif
+
+/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
+ * With a value of 6 all IO in one 32K block make it to the same slot of the
+ * hash table. */
+#define HT_SHIFT 6
+#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
+
+/* Number of elements in the app_reads_hash */
+#define APP_R_HSIZE 15
+
+extern int  drbd_bm_init(struct drbd_conf *mdev);
+extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
+extern void drbd_bm_cleanup(struct drbd_conf *mdev);
+extern void drbd_bm_set_all(struct drbd_conf *mdev);
+extern void drbd_bm_clear_all(struct drbd_conf *mdev);
+extern int  drbd_bm_set_bits(
+		struct drbd_conf *mdev, unsigned long s, unsigned long e);
+extern int  drbd_bm_clear_bits(
+		struct drbd_conf *mdev, unsigned long s, unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock */
+extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
+		const unsigned long s, const unsigned long e);
+extern int  drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
+extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
+extern int  drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
+extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
+extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
+		unsigned long al_enr);
+extern size_t	     drbd_bm_words(struct drbd_conf *mdev);
+extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
+extern sector_t      drbd_bm_capacity(struct drbd_conf *mdev);
+extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
+/* bm_find_next variants for use while you hold drbd_bm_lock() */
+extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
+extern int drbd_bm_rs_done(struct drbd_conf *mdev);
+/* for receive_bitmap */
+extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
+		size_t number, unsigned long *buffer);
+/* for _drbd_send_bitmap and drbd_bm_write_sect */
+extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
+		size_t number, unsigned long *buffer);
+
+extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
+extern void drbd_bm_unlock(struct drbd_conf *mdev);
+
+extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
+/* drbd_main.c */
+
+extern struct kmem_cache *drbd_request_cache;
+extern struct kmem_cache *drbd_ee_cache;	/* epoch entries */
+extern struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+extern mempool_t *drbd_request_mempool;
+extern mempool_t *drbd_ee_mempool;
+
+extern struct page *drbd_pp_pool; /* drbd's page pool */
+extern spinlock_t   drbd_pp_lock;
+extern int	    drbd_pp_vacant;
+extern wait_queue_head_t drbd_pp_wait;
+
+extern rwlock_t global_state_lock;
+
+extern struct drbd_conf *drbd_new_device(unsigned int minor);
+extern void drbd_free_mdev(struct drbd_conf *mdev);
+
+extern int proc_details;
+
+/* drbd_req */
+extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
+extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
+extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
+extern int is_valid_ar_handle(struct drbd_request *, sector_t);
+
+
+/* drbd_nl.c */
+extern void drbd_suspend_io(struct drbd_conf *mdev);
+extern void drbd_resume_io(struct drbd_conf *mdev);
+extern char *ppsize(char *buf, unsigned long long size);
+extern sector_t drbd_new_dev_size(struct drbd_conf *,
+		struct drbd_backing_dev *);
+enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
+extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local);
+extern void resync_after_online_grow(struct drbd_conf *);
+extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
+extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
+		int force);
+enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
+
+/* drbd_worker.c */
+extern int drbd_worker(struct drbd_thread *thi);
+extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
+extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
+extern void resume_next_sg(struct drbd_conf *mdev);
+extern void suspend_other_sg(struct drbd_conf *mdev);
+extern int drbd_resync_finished(struct drbd_conf *mdev);
+/* maybe rather drbd_main.c ? */
+extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
+		struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
+
+static inline void ov_oos_print(struct drbd_conf *mdev)
+{
+	if (mdev->ov_last_oos_size) {
+		dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
+		     (unsigned long long)mdev->ov_last_oos_start,
+		     (unsigned long)mdev->ov_last_oos_size);
+	}
+	mdev->ov_last_oos_size=0;
+}
+
+
+extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+/* worker callbacks */
+extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
+extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
+extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
+extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
+extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
+extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
+extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
+
+extern void resync_timer_fn(unsigned long data);
+
+/* drbd_receiver.c */
+extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
+extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
+					    u64 id,
+					    sector_t sector,
+					    unsigned int data_size,
+					    gfp_t gfp_mask) __must_hold(local);
+extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
+extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+		struct list_head *head);
+extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+		struct list_head *head);
+extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
+extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
+extern void drbd_flush_workqueue(struct drbd_conf *mdev);
+
+/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
+ * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
+static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
+			char __user *optval, int optlen)
+{
+	int err;
+	if (level == SOL_SOCKET)
+		err = sock_setsockopt(sock, level, optname, optval, optlen);
+	else
+		err = sock->ops->setsockopt(sock, level, optname, optval,
+					    optlen);
+	return err;
+}
+
+static inline void drbd_tcp_cork(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_uncork(struct socket *sock)
+{
+	int __user val = 0;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_nodelay(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+			(char __user *)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_quickack(struct socket *sock)
+{
+	int __user val = 1;
+	(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
+			(char __user *)&val, sizeof(val));
+}
+
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
+
+/* drbd_proc.c */
+extern struct proc_dir_entry *drbd_proc;
+extern struct file_operations drbd_proc_fops;
+extern const char *drbd_conn_str(enum drbd_conns s);
+extern const char *drbd_role_str(enum drbd_role s);
+
+/* drbd_actlog.c */
+extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
+extern int drbd_rs_del_all(struct drbd_conf *mdev);
+extern void drbd_rs_failed_io(struct drbd_conf *mdev,
+		sector_t sector, int size);
+extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
+extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
+		int size, const char *file, const unsigned int line);
+#define drbd_set_in_sync(mdev, sector, size) \
+	__drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
+extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
+		int size, const char *file, const unsigned int line);
+#define drbd_set_out_of_sync(mdev, sector, size) \
+	__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
+extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
+extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
+extern void drbd_al_shrink(struct drbd_conf *mdev);
+
+
+/* drbd_nl.c */
+
+void drbd_nl_cleanup(void);
+int __init drbd_nl_init(void);
+void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
+void drbd_bcast_sync_progress(struct drbd_conf *mdev);
+void drbd_bcast_ee(struct drbd_conf *mdev,
+		const char *reason, const int dgs,
+		const char* seen_hash, const char* calc_hash,
+		const struct drbd_epoch_entry* e);
+
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value, that can be bit ored onto the
+ * current state as soon as the spinlock (req_lock) was taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * Means that the network connection was established and that the peer
+ * is in secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+
+#define NS(T, S) \
+	({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+	({ union drbd_state val; val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask; }), \
+	({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+	({ union drbd_state val;  val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val.T3 = (S3); val; })
+
+#define _NS(D, T, S) \
+	D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+	D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+	D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+
+/*
+ * inline helper functions
+ *************************/
+
+static inline void drbd_state_lock(struct drbd_conf *mdev)
+{
+	wait_event(mdev->misc_wait,
+		   !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
+}
+
+static inline void drbd_state_unlock(struct drbd_conf *mdev)
+{
+	clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
+	wake_up(&mdev->misc_wait);
+}
+
+static inline int _drbd_set_state(struct drbd_conf *mdev,
+				   union drbd_state ns, enum chg_state_flags flags,
+				   struct completion *done)
+{
+	int rv;
+
+	read_lock(&global_state_lock);
+	rv = __drbd_set_state(mdev, ns, flags, done);
+	read_unlock(&global_state_lock);
+
+	return rv;
+}
+
+/**
+ * drbd_request_state() - Reqest a state change
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It is verbose
+ * quite verbose in case the state change is not possible, and all those
+ * state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_conf *mdev,
+				     union drbd_state mask,
+				     union drbd_state val)
+{
+	return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
+}
+
+#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
+static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
+{
+	switch (mdev->ldev->dc.on_io_error) {
+	case EP_PASS_ON:
+		if (!forcedetach) {
+			if (printk_ratelimit())
+				dev_err(DEV, "Local IO failed in %s."
+					     "Passing error on...\n", where);
+			break;
+		}
+		/* NOTE fall through to detach case if forcedetach set */
+	case EP_DETACH:
+	case EP_CALL_HELPER:
+		if (mdev->state.disk > D_FAILED) {
+			_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
+			dev_err(DEV, "Local IO failed in %s."
+				     "Detaching...\n", where);
+		}
+		break;
+	}
+}
+
+/**
+ * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
+ * @mdev:	 DRBD device.
+ * @error:	 Error code passed to the IO completion callback
+ * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
+ *
+ * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
+ */
+#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
+static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
+	int error, int forcedetach, const char *where)
+{
+	if (error) {
+		unsigned long flags;
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		__drbd_chk_io_error_(mdev, forcedetach, where);
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+	}
+}
+
+
+/**
+ * drbd_md_first_sector() - Returns the first sector number of the meta data area
+ * @bdev:	Meta data block device.
+ *
+ * BTW, for internal meta data, this happens to be the maximum capacity
+ * we could agree upon with our peer node.
+ */
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->dc.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + bdev->md.bm_offset;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset;
+	}
+}
+
+/**
+ * drbd_md_last_sector() - Return the last sector number of the meta data area
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->dc.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + MD_AL_OFFSET - 1;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset + bdev->md.md_size_sect;
+	}
+}
+
+/* Returns the number of 512 byte sectors of the device */
+static inline sector_t drbd_get_capacity(struct block_device *bdev)
+{
+	/* return bdev ? get_capacity(bdev->bd_disk) : 0; */
+	return bdev ? bdev->bd_inode->i_size >> 9 : 0;
+}
+
+/**
+ * drbd_get_max_capacity() - Returns the capacity we announce to out peer
+ * @bdev:	Meta data block device.
+ *
+ * returns the capacity we announce to out peer.  we clip ourselves at the
+ * various MAX_SECTORS, because if we don't, current implementation will
+ * oops sooner or later
+ */
+static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
+{
+	sector_t s;
+	switch (bdev->dc.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		s = drbd_get_capacity(bdev->backing_bdev)
+			? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+					drbd_md_first_sector(bdev))
+			: 0;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+				drbd_get_capacity(bdev->backing_bdev));
+		/* clip at maximum size the meta device can support */
+		s = min_t(sector_t, s,
+			BM_EXT_TO_SECT(bdev->md.md_size_sect
+				     - bdev->md.bm_offset));
+		break;
+	default:
+		s = min_t(sector_t, DRBD_MAX_SECTORS,
+				drbd_get_capacity(bdev->backing_bdev));
+	}
+	return s;
+}
+
+/**
+ * drbd_md_ss__() - Return the sector number of our meta data super block
+ * @mdev:	DRBD device.
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
+				    struct drbd_backing_dev *bdev)
+{
+	switch (bdev->dc.meta_dev_idx) {
+	default: /* external, some index */
+		return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
+	case DRBD_MD_INDEX_INTERNAL:
+		/* with drbd08, internal meta data is always "flexible" */
+	case DRBD_MD_INDEX_FLEX_INT:
+		/* sizeof(struct md_on_disk_07) == 4k
+		 * position: last 4k aligned block of 4k size */
+		if (!bdev->backing_bdev) {
+			if (__ratelimit(&drbd_ratelimit_state)) {
+				dev_err(DEV, "bdev->backing_bdev==NULL\n");
+				dump_stack();
+			}
+			return 0;
+		}
+		return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
+			- MD_AL_OFFSET;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		return 0;
+	}
+}
+
+static inline void
+_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	list_add_tail(&w->list, &q->q);
+	up(&q->s);
+}
+
+static inline void
+drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&q->q_lock, flags);
+	list_add(&w->list, &q->q);
+	up(&q->s); /* within the spinlock,
+		      see comment near end of drbd_worker() */
+	spin_unlock_irqrestore(&q->q_lock, flags);
+}
+
+static inline void
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&q->q_lock, flags);
+	list_add_tail(&w->list, &q->q);
+	up(&q->s); /* within the spinlock,
+		      see comment near end of drbd_worker() */
+	spin_unlock_irqrestore(&q->q_lock, flags);
+}
+
+static inline void wake_asender(struct drbd_conf *mdev)
+{
+	if (test_bit(SIGNAL_ASENDER, &mdev->flags))
+		force_sig(DRBD_SIG, mdev->asender.task);
+}
+
+static inline void request_ping(struct drbd_conf *mdev)
+{
+	set_bit(SEND_PING, &mdev->flags);
+	wake_asender(mdev);
+}
+
+static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
+	enum drbd_packets cmd)
+{
+	struct p_header h;
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
+}
+
+static inline int drbd_send_ping(struct drbd_conf *mdev)
+{
+	struct p_header h;
+	return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
+}
+
+static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
+{
+	struct p_header h;
+	return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
+}
+
+static inline void drbd_thread_stop(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, FALSE, TRUE);
+}
+
+static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, FALSE, FALSE);
+}
+
+static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, TRUE, FALSE);
+}
+
+/* counts how many answer packets packets we expect from our peer,
+ * for either explicit application requests,
+ * or implicit barrier packets as necessary.
+ * increased:
+ *  w_send_barrier
+ *  _req_mod(req, queue_for_net_write or queue_for_net_read);
+ *    it is much easier and equally valid to count what we queue for the
+ *    worker, even before it actually was queued or send.
+ *    (drbd_make_request_common; recovery path on read io-error)
+ * decreased:
+ *  got_BarrierAck (respective tl_clear, tl_clear_barrier)
+ *  _req_mod(req, data_received)
+ *     [from receive_DataReply]
+ *  _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
+ *     [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
+ *     for some reason it is NOT decreased in got_NegAck,
+ *     but in the resulting cleanup code from report_params.
+ *     we should try to remember the reason for that...
+ *  _req_mod(req, send_failed or send_canceled)
+ *  _req_mod(req, connection_lost_while_pending)
+ *     [from tl_clear_barrier]
+ */
+static inline void inc_ap_pending(struct drbd_conf *mdev)
+{
+	atomic_inc(&mdev->ap_pending_cnt);
+}
+
+#define ERR_IF_CNT_IS_NEGATIVE(which)				\
+	if (atomic_read(&mdev->which) < 0)			\
+		dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n",	\
+		    __func__ , __LINE__ ,			\
+		    atomic_read(&mdev->which))
+
+#define dec_ap_pending(mdev)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	if (atomic_dec_and_test(&mdev->ap_pending_cnt))		\
+		wake_up(&mdev->misc_wait);			\
+	ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
+
+/* counts how many resync-related answers we still expect from the peer
+ *		     increase			decrease
+ * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
+ * C_SYNC_SOURCE sends P_RS_DATA_REPLY   (and expects P_WRITE_ACK whith ID_SYNCER)
+ *					   (or P_NEG_ACK with ID_SYNCER)
+ */
+static inline void inc_rs_pending(struct drbd_conf *mdev)
+{
+	atomic_inc(&mdev->rs_pending_cnt);
+}
+
+#define dec_rs_pending(mdev)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	atomic_dec(&mdev->rs_pending_cnt);			\
+	ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
+
+/* counts how many answers we still need to send to the peer.
+ * increased on
+ *  receive_Data	unless protocol A;
+ *			we need to send a P_RECV_ACK (proto B)
+ *			or P_WRITE_ACK (proto C)
+ *  receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
+ *  receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
+ *  receive_Barrier_*	we need to send a P_BARRIER_ACK
+ */
+static inline void inc_unacked(struct drbd_conf *mdev)
+{
+	atomic_inc(&mdev->unacked_cnt);
+}
+
+#define dec_unacked(mdev)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	atomic_dec(&mdev->unacked_cnt);				\
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
+
+#define sub_unacked(mdev, n)	do {				\
+	typecheck(struct drbd_conf *, mdev);			\
+	atomic_sub(n, &mdev->unacked_cnt);			\
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
+
+
+static inline void put_net_conf(struct drbd_conf *mdev)
+{
+	if (atomic_dec_and_test(&mdev->net_cnt))
+		wake_up(&mdev->misc_wait);
+}
+
+/**
+ * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
+ * @mdev:	DRBD device.
+ *
+ * You have to call put_net_conf() when finished working with mdev->net_conf.
+ */
+static inline int get_net_conf(struct drbd_conf *mdev)
+{
+	int have_net_conf;
+
+	atomic_inc(&mdev->net_cnt);
+	have_net_conf = mdev->state.conn >= C_UNCONNECTED;
+	if (!have_net_conf)
+		put_net_conf(mdev);
+	return have_net_conf;
+}
+
+/**
+ * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
+ * @M:		DRBD device.
+ *
+ * You have to call put_ldev() when finished working with mdev->ldev.
+ */
+#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT))
+#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))
+
+static inline void put_ldev(struct drbd_conf *mdev)
+{
+	__release(local);
+	if (atomic_dec_and_test(&mdev->local_cnt))
+		wake_up(&mdev->misc_wait);
+	D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
+}
+
+#ifndef __CHECKER__
+static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+{
+	int io_allowed;
+
+	atomic_inc(&mdev->local_cnt);
+	io_allowed = (mdev->state.disk >= mins);
+	if (!io_allowed)
+		put_ldev(mdev);
+	return io_allowed;
+}
+#else
+extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
+#endif
+
+/* you must have an "get_ldev" reference */
+static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
+		unsigned long *bits_left, unsigned int *per_mil_done)
+{
+	/*
+	 * this is to break it at compile time when we change that
+	 * (we may feel 4TB maximum storage per drbd is not enough)
+	 */
+	typecheck(unsigned long, mdev->rs_total);
+
+	/* note: both rs_total and rs_left are in bits, i.e. in
+	 * units of BM_BLOCK_SIZE.
+	 * for the percentage, we don't care. */
+
+	*bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+	/* >> 10 to prevent overflow,
+	 * +1 to prevent division by zero */
+	if (*bits_left > mdev->rs_total) {
+		/* doh. maybe a logic bug somewhere.
+		 * may also be just a race condition
+		 * between this and a disconnect during sync.
+		 * for now, just prevent in-kernel buffer overflow.
+		 */
+		smp_rmb();
+		dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
+				drbd_conn_str(mdev->state.conn),
+				*bits_left, mdev->rs_total, mdev->rs_failed);
+		*per_mil_done = 0;
+	} else {
+		/* make sure the calculation happens in long context */
+		unsigned long tmp = 1000UL -
+				(*bits_left >> 10)*1000UL
+				/ ((mdev->rs_total >> 10) + 1UL);
+		*per_mil_done = tmp;
+	}
+}
+
+
+/* this throttles on-the-fly application requests
+ * according to max_buffers settings;
+ * maybe re-implement using semaphores? */
+static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
+{
+	int mxb = 1000000; /* arbitrary limit on open requests */
+	if (get_net_conf(mdev)) {
+		mxb = mdev->net_conf->max_buffers;
+		put_net_conf(mdev);
+	}
+	return mxb;
+}
+
+static inline int drbd_state_is_stable(union drbd_state s)
+{
+
+	/* DO NOT add a default clause, we want the compiler to warn us
+	 * for any newly introduced state we may have forgotten to add here */
+
+	switch ((enum drbd_conns)s.conn) {
+	/* new io only accepted when there is no connection, ... */
+	case C_STANDALONE:
+	case C_WF_CONNECTION:
+	/* ... or there is a well established connection. */
+	case C_CONNECTED:
+	case C_SYNC_SOURCE:
+	case C_SYNC_TARGET:
+	case C_VERIFY_S:
+	case C_VERIFY_T:
+	case C_PAUSED_SYNC_S:
+	case C_PAUSED_SYNC_T:
+		/* maybe stable, look at the disk state */
+		break;
+
+	/* no new io accepted during tansitional states
+	 * like handshake or teardown */
+	case C_DISCONNECTING:
+	case C_UNCONNECTED:
+	case C_TIMEOUT:
+	case C_BROKEN_PIPE:
+	case C_NETWORK_FAILURE:
+	case C_PROTOCOL_ERROR:
+	case C_TEAR_DOWN:
+	case C_WF_REPORT_PARAMS:
+	case C_STARTING_SYNC_S:
+	case C_STARTING_SYNC_T:
+	case C_WF_BITMAP_S:
+	case C_WF_BITMAP_T:
+	case C_WF_SYNC_UUID:
+	case C_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	switch ((enum drbd_disk_state)s.disk) {
+	case D_DISKLESS:
+	case D_INCONSISTENT:
+	case D_OUTDATED:
+	case D_CONSISTENT:
+	case D_UP_TO_DATE:
+		/* disk state is stable as well. */
+		break;
+
+	/* no new io accepted during tansitional states */
+	case D_ATTACHING:
+	case D_FAILED:
+	case D_NEGOTIATING:
+	case D_UNKNOWN:
+	case D_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
+{
+	int mxb = drbd_get_max_buffers(mdev);
+
+	if (mdev->state.susp)
+		return 0;
+	if (test_bit(SUSPEND_IO, &mdev->flags))
+		return 0;
+
+	/* to avoid potential deadlock or bitmap corruption,
+	 * in various places, we only allow new application io
+	 * to start during "stable" states. */
+
+	/* no new io accepted when attaching or detaching the disk */
+	if (!drbd_state_is_stable(mdev->state))
+		return 0;
+
+	/* since some older kernels don't have atomic_add_unless,
+	 * and we are within the spinlock anyways, we have this workaround.  */
+	if (atomic_read(&mdev->ap_bio_cnt) > mxb)
+		return 0;
+	if (test_bit(BITMAP_IO, &mdev->flags))
+		return 0;
+	return 1;
+}
+
+/* I'd like to use wait_event_lock_irq,
+ * but I'm not sure when it got introduced,
+ * and not sure when it has 3 or 4 arguments */
+static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
+{
+	/* compare with after_state_ch,
+	 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
+	DEFINE_WAIT(wait);
+
+	/* we wait here
+	 *    as long as the device is suspended
+	 *    until the bitmap is no longer on the fly during connection
+	 *    handshake as long as we would exeed the max_buffer limit.
+	 *
+	 * to avoid races with the reconnect code,
+	 * we need to atomic_inc within the spinlock. */
+
+	spin_lock_irq(&mdev->req_lock);
+	while (!__inc_ap_bio_cond(mdev)) {
+		prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&mdev->req_lock);
+		schedule();
+		finish_wait(&mdev->misc_wait, &wait);
+		spin_lock_irq(&mdev->req_lock);
+	}
+	atomic_add(one_or_two, &mdev->ap_bio_cnt);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+static inline void dec_ap_bio(struct drbd_conf *mdev)
+{
+	int mxb = drbd_get_max_buffers(mdev);
+	int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
+
+	D_ASSERT(ap_bio >= 0);
+	/* this currently does wake_up for every dec_ap_bio!
+	 * maybe rather introduce some type of hysteresis?
+	 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
+	if (ap_bio < mxb)
+		wake_up(&mdev->misc_wait);
+	if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+	}
+}
+
+static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
+{
+	mdev->ed_uuid = val;
+}
+
+static inline int seq_cmp(u32 a, u32 b)
+{
+	/* we assume wrap around at 32bit.
+	 * for wrap around at 24bit (old atomic_t),
+	 * we'd have to
+	 *  a <<= 8; b <<= 8;
+	 */
+	return (s32)(a) - (s32)(b);
+}
+#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
+#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
+#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
+#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
+/* CAUTION: please no side effects in arguments! */
+#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
+
+static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
+{
+	unsigned int m;
+	spin_lock(&mdev->peer_seq_lock);
+	m = seq_max(mdev->peer_seq, new_seq);
+	mdev->peer_seq = m;
+	spin_unlock(&mdev->peer_seq_lock);
+	if (m == new_seq)
+		wake_up(&mdev->seq_wait);
+}
+
+static inline void drbd_update_congested(struct drbd_conf *mdev)
+{
+	struct sock *sk = mdev->data.socket->sk;
+	if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+		set_bit(NET_CONGESTED, &mdev->flags);
+}
+
+static inline int drbd_queue_order_type(struct drbd_conf *mdev)
+{
+	/* sorry, we currently have no working implementation
+	 * of distributed TCQ stuff */
+#ifndef QUEUE_ORDERED_NONE
+#define QUEUE_ORDERED_NONE 0
+#endif
+	return QUEUE_ORDERED_NONE;
+}
+
+static inline void drbd_blk_run_queue(struct request_queue *q)
+{
+	if (q && q->unplug_fn)
+		q->unplug_fn(q);
+}
+
+static inline void drbd_kick_lo(struct drbd_conf *mdev)
+{
+	if (get_ldev(mdev)) {
+		drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
+		put_ldev(mdev);
+	}
+}
+
+static inline void drbd_md_flush(struct drbd_conf *mdev)
+{
+	int r;
+
+	if (test_bit(MD_NO_BARRIER, &mdev->flags))
+		return;
+
+	r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
+	if (r) {
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+		dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
+	}
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 000000000000..edf0b8031e69
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3735 @@
+/*
+   drbd.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+   from Logicworks, Inc. for making SDP replication support possible.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/drbd.h>
+#include <asm/uaccess.h>
+#include <asm/types.h>
+#include <net/sock.h>
+#include <linux/ctype.h>
+#include <linux/smp_lock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
+
+#include "drbd_vli.h"
+
+struct after_state_chg_work {
+	struct drbd_work w;
+	union drbd_state os;
+	union drbd_state ns;
+	enum chg_state_flags flags;
+	struct completion *done;
+};
+
+int drbdd_init(struct drbd_thread *);
+int drbd_worker(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+int drbd_init(void);
+static int drbd_open(struct block_device *bdev, fmode_t mode);
+static int drbd_release(struct gendisk *gd, fmode_t mode);
+static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags);
+static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void md_sync_timer_fn(unsigned long data);
+static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+
+DEFINE_TRACE(drbd_unplug);
+DEFINE_TRACE(drbd_uuid);
+DEFINE_TRACE(drbd_ee);
+DEFINE_TRACE(drbd_packet);
+DEFINE_TRACE(drbd_md_io);
+DEFINE_TRACE(drbd_epoch);
+DEFINE_TRACE(drbd_netlink);
+DEFINE_TRACE(drbd_actlog);
+DEFINE_TRACE(drbd_bio);
+DEFINE_TRACE(_drbd_resync);
+DEFINE_TRACE(drbd_req);
+
+MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
+	      "Lars Ellenberg <lars@linbit.com>");
+MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
+MODULE_VERSION(REL_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
+MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
+
+#include <linux/moduleparam.h>
+/* allow_open_on_secondary */
+MODULE_PARM_DESC(allow_oos, "DONT USE!");
+/* thanks to these macros, if compiled into the kernel (not-module),
+ * this becomes the boot parameter drbd.minor_count */
+module_param(minor_count, uint, 0444);
+module_param(disable_sendpage, bool, 0644);
+module_param(allow_oos, bool, 0);
+module_param(cn_idx, uint, 0444);
+module_param(proc_details, int, 0644);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+int enable_faults;
+int fault_rate;
+static int fault_count;
+int fault_devs;
+/* bitmap of enabled faults */
+module_param(enable_faults, int, 0664);
+/* fault rate % value - applies to all enabled faults */
+module_param(fault_rate, int, 0664);
+/* count of faults inserted */
+module_param(fault_count, int, 0664);
+/* bitmap of devices to insert faults on */
+module_param(fault_devs, int, 0644);
+#endif
+
+/* module parameter, defined */
+unsigned int minor_count = 32;
+int disable_sendpage;
+int allow_oos;
+unsigned int cn_idx = CN_IDX_DRBD;
+int proc_details;       /* Detail level in proc drbd*/
+
+/* Module parameter for setting the user mode helper program
+ * to run. Default is /sbin/drbdadm */
+char usermode_helper[80] = "/sbin/drbdadm";
+
+module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
+
+/* in 2.6.x, our device mapping and config info contains our virtual gendisks
+ * as member "struct gendisk *vdisk;"
+ */
+struct drbd_conf **minor_table;
+
+struct kmem_cache *drbd_request_cache;
+struct kmem_cache *drbd_ee_cache;	/* epoch entries */
+struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+mempool_t *drbd_request_mempool;
+mempool_t *drbd_ee_mempool;
+
+/* I do not use a standard mempool, because:
+   1) I want to hand out the pre-allocated objects first.
+   2) I want to be able to interrupt sleeping allocation with a signal.
+   Note: This is a single linked list, the next pointer is the private
+	 member of struct page.
+ */
+struct page *drbd_pp_pool;
+spinlock_t   drbd_pp_lock;
+int          drbd_pp_vacant;
+wait_queue_head_t drbd_pp_wait;
+
+DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
+
+static struct block_device_operations drbd_ops = {
+	.owner =   THIS_MODULE,
+	.open =    drbd_open,
+	.release = drbd_release,
+};
+
+#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+
+#ifdef __CHECKER__
+/* When checking with sparse, and this is an inline function, sparse will
+   give tons of false positives. When this is a real functions sparse works.
+ */
+int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
+{
+	int io_allowed;
+
+	atomic_inc(&mdev->local_cnt);
+	io_allowed = (mdev->state.disk >= mins);
+	if (!io_allowed) {
+		if (atomic_dec_and_test(&mdev->local_cnt))
+			wake_up(&mdev->misc_wait);
+	}
+	return io_allowed;
+}
+
+#endif
+
+/**
+ * DOC: The transfer log
+ *
+ * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
+ * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
+ * of the list. There is always at least one &struct drbd_tl_epoch object.
+ *
+ * Each &struct drbd_tl_epoch has a circular double linked list of requests
+ * attached.
+ */
+static int tl_init(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+
+	/* during device minor initialization, we may well use GFP_KERNEL */
+	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
+	if (!b)
+		return 0;
+	INIT_LIST_HEAD(&b->requests);
+	INIT_LIST_HEAD(&b->w.list);
+	b->next = NULL;
+	b->br_number = 4711;
+	b->n_req = 0;
+	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
+
+	mdev->oldest_tle = b;
+	mdev->newest_tle = b;
+	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
+
+	mdev->tl_hash = NULL;
+	mdev->tl_hash_s = 0;
+
+	return 1;
+}
+
+static void tl_cleanup(struct drbd_conf *mdev)
+{
+	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
+	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
+	kfree(mdev->oldest_tle);
+	mdev->oldest_tle = NULL;
+	kfree(mdev->unused_spare_tle);
+	mdev->unused_spare_tle = NULL;
+	kfree(mdev->tl_hash);
+	mdev->tl_hash = NULL;
+	mdev->tl_hash_s = 0;
+}
+
+/**
+ * _tl_add_barrier() - Adds a barrier to the transfer log
+ * @mdev:	DRBD device.
+ * @new:	Barrier to be added before the current head of the TL.
+ *
+ * The caller must hold the req_lock.
+ */
+void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
+{
+	struct drbd_tl_epoch *newest_before;
+
+	INIT_LIST_HEAD(&new->requests);
+	INIT_LIST_HEAD(&new->w.list);
+	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
+	new->next = NULL;
+	new->n_req = 0;
+
+	newest_before = mdev->newest_tle;
+	/* never send a barrier number == 0, because that is special-cased
+	 * when using TCQ for our write ordering code */
+	new->br_number = (newest_before->br_number+1) ?: 1;
+	if (mdev->newest_tle != new) {
+		mdev->newest_tle->next = new;
+		mdev->newest_tle = new;
+	}
+}
+
+/**
+ * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
+ * @mdev:	DRBD device.
+ * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
+ * @set_size:	Expected number of requests before that barrier.
+ *
+ * In case the passed barrier_nr or set_size does not match the oldest
+ * &struct drbd_tl_epoch objects this function will cause a termination
+ * of the connection.
+ */
+void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+		       unsigned int set_size)
+{
+	struct drbd_tl_epoch *b, *nob; /* next old barrier */
+	struct list_head *le, *tle;
+	struct drbd_request *r;
+
+	spin_lock_irq(&mdev->req_lock);
+
+	b = mdev->oldest_tle;
+
+	/* first some paranoia code */
+	if (b == NULL) {
+		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+			barrier_nr);
+		goto bail;
+	}
+	if (b->br_number != barrier_nr) {
+		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
+			barrier_nr, b->br_number);
+		goto bail;
+	}
+	if (b->n_req != set_size) {
+		dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
+			barrier_nr, set_size, b->n_req);
+		goto bail;
+	}
+
+	/* Clean up list of requests processed during current epoch */
+	list_for_each_safe(le, tle, &b->requests) {
+		r = list_entry(le, struct drbd_request, tl_requests);
+		_req_mod(r, barrier_acked);
+	}
+	/* There could be requests on the list waiting for completion
+	   of the write to the local disk. To avoid corruptions of
+	   slab's data structures we have to remove the lists head.
+
+	   Also there could have been a barrier ack out of sequence, overtaking
+	   the write acks - which would be a bug and violating write ordering.
+	   To not deadlock in case we lose connection while such requests are
+	   still pending, we need some way to find them for the
+	   _req_mode(connection_lost_while_pending).
+
+	   These have been list_move'd to the out_of_sequence_requests list in
+	   _req_mod(, barrier_acked) above.
+	   */
+	list_del_init(&b->requests);
+
+	nob = b->next;
+	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
+		_tl_add_barrier(mdev, b);
+		if (nob)
+			mdev->oldest_tle = nob;
+		/* if nob == NULL b was the only barrier, and becomes the new
+		   barrier. Therefore mdev->oldest_tle points already to b */
+	} else {
+		D_ASSERT(nob != NULL);
+		mdev->oldest_tle = nob;
+		kfree(b);
+	}
+
+	spin_unlock_irq(&mdev->req_lock);
+	dec_ap_pending(mdev);
+
+	return;
+
+bail:
+	spin_unlock_irq(&mdev->req_lock);
+	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+}
+
+
+/**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @mdev:	DRBD device.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer gets marked as our of sync. Called from the
+ * receiver thread and the worker thread.
+ */
+void tl_clear(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b, *tmp;
+	struct list_head *le, *tle;
+	struct drbd_request *r;
+	int new_initial_bnr = net_random();
+
+	spin_lock_irq(&mdev->req_lock);
+
+	b = mdev->oldest_tle;
+	while (b) {
+		list_for_each_safe(le, tle, &b->requests) {
+			r = list_entry(le, struct drbd_request, tl_requests);
+			/* It would be nice to complete outside of spinlock.
+			 * But this is easier for now. */
+			_req_mod(r, connection_lost_while_pending);
+		}
+		tmp = b->next;
+
+		/* there could still be requests on that ring list,
+		 * in case local io is still pending */
+		list_del(&b->requests);
+
+		/* dec_ap_pending corresponding to queue_barrier.
+		 * the newest barrier may not have been queued yet,
+		 * in which case w.cb is still NULL. */
+		if (b->w.cb != NULL)
+			dec_ap_pending(mdev);
+
+		if (b == mdev->newest_tle) {
+			/* recycle, but reinit! */
+			D_ASSERT(tmp == NULL);
+			INIT_LIST_HEAD(&b->requests);
+			INIT_LIST_HEAD(&b->w.list);
+			b->w.cb = NULL;
+			b->br_number = new_initial_bnr;
+			b->n_req = 0;
+
+			mdev->oldest_tle = b;
+			break;
+		}
+		kfree(b);
+		b = tmp;
+	}
+
+	/* we expect this list to be empty. */
+	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
+
+	/* but just in case, clean it up anyways! */
+	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
+		r = list_entry(le, struct drbd_request, tl_requests);
+		/* It would be nice to complete outside of spinlock.
+		 * But this is easier for now. */
+		_req_mod(r, connection_lost_while_pending);
+	}
+
+	/* ensure bit indicating barrier is required is clear */
+	clear_bit(CREATE_BARRIER, &mdev->flags);
+
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
+ * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
+ * @mdev:	DRBD device.
+ * @os:		old (current) state.
+ * @ns:		new (wanted) state.
+ */
+static int cl_wide_st_chg(struct drbd_conf *mdev,
+			  union drbd_state os, union drbd_state ns)
+{
+	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
+		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
+}
+
+int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
+		      union drbd_state mask, union drbd_state val)
+{
+	unsigned long flags;
+	union drbd_state os, ns;
+	int rv;
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+	rv = _drbd_set_state(mdev, ns, f, NULL);
+	ns = mdev->state;
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	return rv;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ */
+void drbd_force_state(struct drbd_conf *mdev,
+	union drbd_state mask, union drbd_state val)
+{
+	drbd_change_state(mdev, CS_HARD, mask, val);
+}
+
+static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
+static int is_valid_state_transition(struct drbd_conf *,
+				     union drbd_state, union drbd_state);
+static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
+				       union drbd_state ns, int *warn_sync_abort);
+int drbd_send_state_req(struct drbd_conf *,
+			union drbd_state, union drbd_state);
+
+static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
+				    union drbd_state mask, union drbd_state val)
+{
+	union drbd_state os, ns;
+	unsigned long flags;
+	int rv;
+
+	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
+		return SS_CW_SUCCESS;
+
+	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
+		return SS_CW_FAILED_BY_PEER;
+
+	rv = 0;
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+	ns = sanitize_state(mdev, os, ns, NULL);
+
+	if (!cl_wide_st_chg(mdev, os, ns))
+		rv = SS_CW_NO_NEED;
+	if (!rv) {
+		rv = is_valid_state(mdev, ns);
+		if (rv == SS_SUCCESS) {
+			rv = is_valid_state_transition(mdev, ns, os);
+			if (rv == SS_SUCCESS)
+				rv = 0; /* cont waiting, otherwise fail. */
+		}
+	}
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	return rv;
+}
+
+/**
+ * drbd_req_state() - Perform an eventually cluster wide state change
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+static int drbd_req_state(struct drbd_conf *mdev,
+			  union drbd_state mask, union drbd_state val,
+			  enum chg_state_flags f)
+{
+	struct completion done;
+	unsigned long flags;
+	union drbd_state os, ns;
+	int rv;
+
+	init_completion(&done);
+
+	if (f & CS_SERIALIZE)
+		mutex_lock(&mdev->state_mutex);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	os = mdev->state;
+	ns.i = (os.i & ~mask.i) | val.i;
+	ns = sanitize_state(mdev, os, ns, NULL);
+
+	if (cl_wide_st_chg(mdev, os, ns)) {
+		rv = is_valid_state(mdev, ns);
+		if (rv == SS_SUCCESS)
+			rv = is_valid_state_transition(mdev, ns, os);
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+		if (rv < SS_SUCCESS) {
+			if (f & CS_VERBOSE)
+				print_st_err(mdev, os, ns, rv);
+			goto abort;
+		}
+
+		drbd_state_lock(mdev);
+		if (!drbd_send_state_req(mdev, mask, val)) {
+			drbd_state_unlock(mdev);
+			rv = SS_CW_FAILED_BY_PEER;
+			if (f & CS_VERBOSE)
+				print_st_err(mdev, os, ns, rv);
+			goto abort;
+		}
+
+		wait_event(mdev->state_wait,
+			(rv = _req_st_cond(mdev, mask, val)));
+
+		if (rv < SS_SUCCESS) {
+			drbd_state_unlock(mdev);
+			if (f & CS_VERBOSE)
+				print_st_err(mdev, os, ns, rv);
+			goto abort;
+		}
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		os = mdev->state;
+		ns.i = (os.i & ~mask.i) | val.i;
+		rv = _drbd_set_state(mdev, ns, f, &done);
+		drbd_state_unlock(mdev);
+	} else {
+		rv = _drbd_set_state(mdev, ns, f, &done);
+	}
+
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+		D_ASSERT(current != mdev->worker.task);
+		wait_for_completion(&done);
+	}
+
+abort:
+	if (f & CS_SERIALIZE)
+		mutex_unlock(&mdev->state_mutex);
+
+	return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @mdev:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags
+ *
+ * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
+ * flag, or when logging of failed state change requests is not desired.
+ */
+int _drbd_request_state(struct drbd_conf *mdev,	union drbd_state mask,
+			union drbd_state val,	enum chg_state_flags f)
+{
+	int rv;
+
+	wait_event(mdev->state_wait,
+		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+	return rv;
+}
+
+static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
+{
+	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
+	    name,
+	    drbd_conn_str(ns.conn),
+	    drbd_role_str(ns.role),
+	    drbd_role_str(ns.peer),
+	    drbd_disk_str(ns.disk),
+	    drbd_disk_str(ns.pdsk),
+	    ns.susp ? 's' : 'r',
+	    ns.aftr_isp ? 'a' : '-',
+	    ns.peer_isp ? 'p' : '-',
+	    ns.user_isp ? 'u' : '-'
+	    );
+}
+
+void print_st_err(struct drbd_conf *mdev,
+	union drbd_state os, union drbd_state ns, int err)
+{
+	if (err == SS_IN_TRANSIENT_STATE)
+		return;
+	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
+	print_st(mdev, " state", os);
+	print_st(mdev, "wanted", ns);
+}
+
+
+#define drbd_peer_str drbd_role_str
+#define drbd_pdsk_str drbd_disk_str
+
+#define drbd_susp_str(A)     ((A) ? "1" : "0")
+#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
+#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
+#define drbd_user_isp_str(A) ((A) ? "1" : "0")
+
+#define PSC(A) \
+	({ if (ns.A != os.A) { \
+		pbp += sprintf(pbp, #A "( %s -> %s ) ", \
+			      drbd_##A##_str(os.A), \
+			      drbd_##A##_str(ns.A)); \
+	} })
+
+/**
+ * is_valid_state() - Returns an SS_ error code if ns is not valid
+ * @mdev:	DRBD device.
+ * @ns:		State to consider.
+ */
+static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
+{
+	/* See drbd_state_sw_errors in drbd_strings.c */
+
+	enum drbd_fencing_p fp;
+	int rv = SS_SUCCESS;
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	if (get_net_conf(mdev)) {
+		if (!mdev->net_conf->two_primaries &&
+		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
+			rv = SS_TWO_PRIMARIES;
+		put_net_conf(mdev);
+	}
+
+	if (rv <= 0)
+		/* already found a reason to abort */;
+	else if (ns.role == R_SECONDARY && mdev->open_cnt)
+		rv = SS_DEVICE_IN_USE;
+
+	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (fp >= FP_RESOURCE &&
+		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+		rv = SS_PRIMARY_NOP;
+
+	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+		rv = SS_NO_LOCAL_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+		rv = SS_NO_REMOTE_DISK;
+
+	else if ((ns.conn == C_CONNECTED ||
+		  ns.conn == C_WF_BITMAP_S ||
+		  ns.conn == C_SYNC_SOURCE ||
+		  ns.conn == C_PAUSED_SYNC_S) &&
+		  ns.disk == D_OUTDATED)
+		rv = SS_CONNECTED_OUTDATES;
+
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		 (mdev->sync_conf.verify_alg[0] == 0))
+		rv = SS_NO_VERIFY_ALG;
+
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		  mdev->agreed_pro_version < 88)
+		rv = SS_NOT_SUPPORTED;
+
+	return rv;
+}
+
+/**
+ * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
+ * @mdev:	DRBD device.
+ * @ns:		new state.
+ * @os:		old state.
+ */
+static int is_valid_state_transition(struct drbd_conf *mdev,
+				     union drbd_state ns, union drbd_state os)
+{
+	int rv = SS_SUCCESS;
+
+	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+	    os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+		rv = SS_ALREADY_STANDALONE;
+
+	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+		rv = SS_IS_DISKLESS;
+
+	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+		rv = SS_NO_NET_CONFIG;
+
+	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+		rv = SS_LOWER_THAN_OUTDATED;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+		rv = SS_IN_TRANSIENT_STATE;
+
+	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
+		rv = SS_IN_TRANSIENT_STATE;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+	    ns.conn != os.conn && os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+	    os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	return rv;
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @mdev:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @warn_sync_abort:
+ *
+ * When we loose connection, we have to set the state of the peers disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
+				       union drbd_state ns, int *warn_sync_abort)
+{
+	enum drbd_fencing_p fp;
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	/* Disallow Network errors to configure a device's network part */
+	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
+	    os.conn <= C_DISCONNECTING)
+		ns.conn = os.conn;
+
+	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
+	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
+	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
+		ns.conn = os.conn;
+
+	/* After C_DISCONNECTING only C_STANDALONE may follow */
+	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
+		ns.conn = os.conn;
+
+	if (ns.conn < C_CONNECTED) {
+		ns.peer_isp = 0;
+		ns.peer = R_UNKNOWN;
+		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+			ns.pdsk = D_UNKNOWN;
+	}
+
+	/* Clear the aftr_isp when becoming unconfigured */
+	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+		ns.aftr_isp = 0;
+
+	if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
+		ns.pdsk = D_UNKNOWN;
+
+	/* Abort resync if a disk fails/detaches */
+	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
+	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+		if (warn_sync_abort)
+			*warn_sync_abort = 1;
+		ns.conn = C_CONNECTED;
+	}
+
+	if (ns.conn >= C_CONNECTED &&
+	    ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
+	     (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
+		switch (ns.conn) {
+		case C_WF_BITMAP_T:
+		case C_PAUSED_SYNC_T:
+			ns.disk = D_OUTDATED;
+			break;
+		case C_CONNECTED:
+		case C_WF_BITMAP_S:
+		case C_SYNC_SOURCE:
+		case C_PAUSED_SYNC_S:
+			ns.disk = D_UP_TO_DATE;
+			break;
+		case C_SYNC_TARGET:
+			ns.disk = D_INCONSISTENT;
+			dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
+			break;
+		}
+		if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
+			dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
+	}
+
+	if (ns.conn >= C_CONNECTED &&
+	    (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
+		switch (ns.conn) {
+		case C_CONNECTED:
+		case C_WF_BITMAP_T:
+		case C_PAUSED_SYNC_T:
+		case C_SYNC_TARGET:
+			ns.pdsk = D_UP_TO_DATE;
+			break;
+		case C_WF_BITMAP_S:
+		case C_PAUSED_SYNC_S:
+			ns.pdsk = D_OUTDATED;
+			break;
+		case C_SYNC_SOURCE:
+			ns.pdsk = D_INCONSISTENT;
+			dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
+			break;
+		}
+		if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
+			dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
+	}
+
+	/* Connection breaks down before we finished "Negotiating" */
+	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
+			ns.disk = mdev->new_state_tmp.disk;
+			ns.pdsk = mdev->new_state_tmp.pdsk;
+		} else {
+			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
+			ns.disk = D_DISKLESS;
+			ns.pdsk = D_UNKNOWN;
+		}
+		put_ldev(mdev);
+	}
+
+	if (fp == FP_STONITH &&
+	    (ns.role == R_PRIMARY &&
+	     ns.conn < C_CONNECTED &&
+	     ns.pdsk > D_OUTDATED))
+			ns.susp = 1;
+
+	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+		if (ns.conn == C_SYNC_SOURCE)
+			ns.conn = C_PAUSED_SYNC_S;
+		if (ns.conn == C_SYNC_TARGET)
+			ns.conn = C_PAUSED_SYNC_T;
+	} else {
+		if (ns.conn == C_PAUSED_SYNC_S)
+			ns.conn = C_SYNC_SOURCE;
+		if (ns.conn == C_PAUSED_SYNC_T)
+			ns.conn = C_SYNC_TARGET;
+	}
+
+	return ns;
+}
+
+/* helper for __drbd_set_state */
+static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
+{
+	if (cs == C_VERIFY_T) {
+		/* starting online verify from an arbitrary position
+		 * does not fit well into the existing protocol.
+		 * on C_VERIFY_T, we initialize ov_left and friends
+		 * implicitly in receive_DataRequest once the
+		 * first P_OV_REQUEST is received */
+		mdev->ov_start_sector = ~(sector_t)0;
+	} else {
+		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
+		if (bit >= mdev->rs_total)
+			mdev->ov_start_sector =
+				BM_BIT_TO_SECT(mdev->rs_total - 1);
+		mdev->ov_position = mdev->ov_start_sector;
+	}
+}
+
+/**
+ * __drbd_set_state() - Set a new DRBD state
+ * @mdev:	DRBD device.
+ * @ns:		new state.
+ * @flags:	Flags
+ * @done:	Optional completion, that will get completed after the after_state_ch() finished
+ *
+ * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ */
+int __drbd_set_state(struct drbd_conf *mdev,
+		    union drbd_state ns, enum chg_state_flags flags,
+		    struct completion *done)
+{
+	union drbd_state os;
+	int rv = SS_SUCCESS;
+	int warn_sync_abort = 0;
+	struct after_state_chg_work *ascw;
+
+	os = mdev->state;
+
+	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
+
+	if (ns.i == os.i)
+		return SS_NOTHING_TO_DO;
+
+	if (!(flags & CS_HARD)) {
+		/*  pre-state-change checks ; only look at ns  */
+		/* See drbd_state_sw_errors in drbd_strings.c */
+
+		rv = is_valid_state(mdev, ns);
+		if (rv < SS_SUCCESS) {
+			/* If the old state was illegal as well, then let
+			   this happen...*/
+
+			if (is_valid_state(mdev, os) == rv) {
+				dev_err(DEV, "Considering state change from bad state. "
+				    "Error would be: '%s'\n",
+				    drbd_set_st_err_str(rv));
+				print_st(mdev, "old", os);
+				print_st(mdev, "new", ns);
+				rv = is_valid_state_transition(mdev, ns, os);
+			}
+		} else
+			rv = is_valid_state_transition(mdev, ns, os);
+	}
+
+	if (rv < SS_SUCCESS) {
+		if (flags & CS_VERBOSE)
+			print_st_err(mdev, os, ns, rv);
+		return rv;
+	}
+
+	if (warn_sync_abort)
+		dev_warn(DEV, "Resync aborted.\n");
+
+	{
+		char *pbp, pb[300];
+		pbp = pb;
+		*pbp = 0;
+		PSC(role);
+		PSC(peer);
+		PSC(conn);
+		PSC(disk);
+		PSC(pdsk);
+		PSC(susp);
+		PSC(aftr_isp);
+		PSC(peer_isp);
+		PSC(user_isp);
+		dev_info(DEV, "%s\n", pb);
+	}
+
+	/* solve the race between becoming unconfigured,
+	 * worker doing the cleanup, and
+	 * admin reconfiguring us:
+	 * on (re)configure, first set CONFIG_PENDING,
+	 * then wait for a potentially exiting worker,
+	 * start the worker, and schedule one no_op.
+	 * then proceed with configuration.
+	 */
+	if (ns.disk == D_DISKLESS &&
+	    ns.conn == C_STANDALONE &&
+	    ns.role == R_SECONDARY &&
+	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
+		set_bit(DEVICE_DYING, &mdev->flags);
+
+	mdev->state.i = ns.i;
+	wake_up(&mdev->misc_wait);
+	wake_up(&mdev->state_wait);
+
+	/*   post-state-change actions   */
+	if (os.conn >= C_SYNC_SOURCE   && ns.conn <= C_CONNECTED) {
+		set_bit(STOP_SYNC_TIMER, &mdev->flags);
+		mod_timer(&mdev->resync_timer, jiffies);
+	}
+
+	/* aborted verify run. log the last position */
+	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+	    ns.conn < C_CONNECTED) {
+		mdev->ov_start_sector =
+			BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
+		dev_info(DEV, "Online Verify reached sector %llu\n",
+			(unsigned long long)mdev->ov_start_sector);
+	}
+
+	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
+		dev_info(DEV, "Syncer continues.\n");
+		mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
+		if (ns.conn == C_SYNC_TARGET) {
+			if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
+				mod_timer(&mdev->resync_timer, jiffies);
+			/* This if (!test_bit) is only needed for the case
+			   that a device that has ceased to used its timer,
+			   i.e. it is already in drbd_resync_finished() gets
+			   paused and resumed. */
+		}
+	}
+
+	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
+	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+		dev_info(DEV, "Resync suspended\n");
+		mdev->rs_mark_time = jiffies;
+		if (ns.conn == C_PAUSED_SYNC_T)
+			set_bit(STOP_SYNC_TIMER, &mdev->flags);
+	}
+
+	if (os.conn == C_CONNECTED &&
+	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+		mdev->ov_position = 0;
+		mdev->rs_total =
+		mdev->rs_mark_left = drbd_bm_bits(mdev);
+		if (mdev->agreed_pro_version >= 90)
+			set_ov_position(mdev, ns.conn);
+		else
+			mdev->ov_start_sector = 0;
+		mdev->ov_left = mdev->rs_total
+			      - BM_SECT_TO_BIT(mdev->ov_position);
+		mdev->rs_start     =
+		mdev->rs_mark_time = jiffies;
+		mdev->ov_last_oos_size = 0;
+		mdev->ov_last_oos_start = 0;
+
+		if (ns.conn == C_VERIFY_S) {
+			dev_info(DEV, "Starting Online Verify from sector %llu\n",
+					(unsigned long long)mdev->ov_position);
+			mod_timer(&mdev->resync_timer, jiffies);
+		}
+	}
+
+	if (get_ldev(mdev)) {
+		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
+			mdf |= MDF_CRASHED_PRIMARY;
+		if (mdev->state.role == R_PRIMARY ||
+		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
+			mdf |= MDF_PRIMARY_IND;
+		if (mdev->state.conn > C_WF_REPORT_PARAMS)
+			mdf |= MDF_CONNECTED_IND;
+		if (mdev->state.disk > D_INCONSISTENT)
+			mdf |= MDF_CONSISTENT;
+		if (mdev->state.disk > D_OUTDATED)
+			mdf |= MDF_WAS_UP_TO_DATE;
+		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
+			mdf |= MDF_PEER_OUT_DATED;
+		if (mdf != mdev->ldev->md.flags) {
+			mdev->ldev->md.flags = mdf;
+			drbd_md_mark_dirty(mdev);
+		}
+		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
+		put_ldev(mdev);
+	}
+
+	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+		set_bit(CONSIDER_RESYNC, &mdev->flags);
+
+	/* Receiver should clean up itself */
+	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+		drbd_thread_stop_nowait(&mdev->receiver);
+
+	/* Now the receiver finished cleaning up itself, it should die */
+	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+		drbd_thread_stop_nowait(&mdev->receiver);
+
+	/* Upon network failure, we need to restart the receiver. */
+	if (os.conn > C_TEAR_DOWN &&
+	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+		drbd_thread_restart_nowait(&mdev->receiver);
+
+	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+	if (ascw) {
+		ascw->os = os;
+		ascw->ns = ns;
+		ascw->flags = flags;
+		ascw->w.cb = w_after_state_ch;
+		ascw->done = done;
+		drbd_queue_work(&mdev->data.work, &ascw->w);
+	} else {
+		dev_warn(DEV, "Could not kmalloc an ascw\n");
+	}
+
+	return rv;
+}
+
+static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct after_state_chg_work *ascw =
+		container_of(w, struct after_state_chg_work, w);
+	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
+	if (ascw->flags & CS_WAIT_COMPLETE) {
+		D_ASSERT(ascw->done != NULL);
+		complete(ascw->done);
+	}
+	kfree(ascw);
+
+	return 1;
+}
+
+static void abw_start_sync(struct drbd_conf *mdev, int rv)
+{
+	if (rv) {
+		dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
+		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
+		return;
+	}
+
+	switch (mdev->state.conn) {
+	case C_STARTING_SYNC_T:
+		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		break;
+	case C_STARTING_SYNC_S:
+		drbd_start_resync(mdev, C_SYNC_SOURCE);
+		break;
+	}
+}
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @mdev:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @flags:	Flags
+ */
+static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags)
+{
+	enum drbd_fencing_p fp;
+
+	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+		clear_bit(CRASHED_PRIMARY, &mdev->flags);
+		if (mdev->p_uuid)
+			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
+	}
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	/* Inform userspace about the change... */
+	drbd_bcast_state(mdev, ns);
+
+	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+		drbd_khelper(mdev, "pri-on-incon-degr");
+
+	/* Here we have the actions that are performed after a
+	   state change. This function might sleep */
+
+	if (fp == FP_STONITH && ns.susp) {
+		/* case1: The outdate peer handler is successful:
+		 * case2: The connection was established again: */
+		if ((os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) ||
+		    (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
+			tl_clear(mdev);
+			spin_lock_irq(&mdev->req_lock);
+			_drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+			spin_unlock_irq(&mdev->req_lock);
+		}
+	}
+	/* Do not change the order of the if above and the two below... */
+	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
+		drbd_send_uuids(mdev);
+		drbd_send_state(mdev);
+	}
+	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
+		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
+
+	/* Lost contact to peer's copy of the data */
+	if ((os.pdsk >= D_INCONSISTENT &&
+	     os.pdsk != D_UNKNOWN &&
+	     os.pdsk != D_OUTDATED)
+	&&  (ns.pdsk < D_INCONSISTENT ||
+	     ns.pdsk == D_UNKNOWN ||
+	     ns.pdsk == D_OUTDATED)) {
+		kfree(mdev->p_uuid);
+		mdev->p_uuid = NULL;
+		if (get_ldev(mdev)) {
+			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+				drbd_uuid_new_current(mdev);
+				drbd_send_uuids(mdev);
+			}
+			put_ldev(mdev);
+		}
+	}
+
+	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
+		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
+			drbd_uuid_new_current(mdev);
+
+		/* D_DISKLESS Peer becomes secondary */
+		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+			drbd_al_to_on_disk_bm(mdev);
+		put_ldev(mdev);
+	}
+
+	/* Last part of the attaching process ... */
+	if (ns.conn >= C_CONNECTED &&
+	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+		kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
+		mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
+		drbd_send_sizes(mdev, 0);  /* to start sync... */
+		drbd_send_uuids(mdev);
+		drbd_send_state(mdev);
+	}
+
+	/* We want to pause/continue resync, tell peer. */
+	if (ns.conn >= C_CONNECTED &&
+	     ((os.aftr_isp != ns.aftr_isp) ||
+	      (os.user_isp != ns.user_isp)))
+		drbd_send_state(mdev);
+
+	/* In case one of the isp bits got set, suspend other devices. */
+	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+		suspend_other_sg(mdev);
+
+	/* Make sure the peer gets informed about eventual state
+	   changes (ISP bits) while we were in WFReportParams. */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+		drbd_send_state(mdev);
+
+	/* We are in the progress to start a full sync... */
+	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
+
+	/* We are invalidating our self... */
+	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
+	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
+		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
+
+	if (os.disk > D_FAILED && ns.disk == D_FAILED) {
+		enum drbd_io_error_p eh;
+
+		eh = EP_PASS_ON;
+		if (get_ldev_if_state(mdev, D_FAILED)) {
+			eh = mdev->ldev->dc.on_io_error;
+			put_ldev(mdev);
+		}
+
+		drbd_rs_cancel_all(mdev);
+		/* since get_ldev() only works as long as disk>=D_INCONSISTENT,
+		   and it is D_DISKLESS here, local_cnt can only go down, it can
+		   not increase... It will reach zero */
+		wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+		mdev->rs_total = 0;
+		mdev->rs_failed = 0;
+		atomic_set(&mdev->rs_pending_cnt, 0);
+
+		spin_lock_irq(&mdev->req_lock);
+		_drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
+		spin_unlock_irq(&mdev->req_lock);
+
+		if (eh == EP_CALL_HELPER)
+			drbd_khelper(mdev, "local-io-error");
+	}
+
+	if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
+
+		if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
+			if (drbd_send_state(mdev))
+				dev_warn(DEV, "Notified peer that my disk is broken.\n");
+			else
+				dev_err(DEV, "Sending state in drbd_io_error() failed\n");
+		}
+
+		lc_destroy(mdev->resync);
+		mdev->resync = NULL;
+		lc_destroy(mdev->act_log);
+		mdev->act_log = NULL;
+		__no_warn(local,
+			drbd_free_bc(mdev->ldev);
+			mdev->ldev = NULL;);
+
+		if (mdev->md_io_tmpp)
+			__free_page(mdev->md_io_tmpp);
+	}
+
+	/* Disks got bigger while they were detached */
+	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
+		if (ns.conn == C_CONNECTED)
+			resync_after_online_grow(mdev);
+	}
+
+	/* A resync finished or aborted, wake paused devices... */
+	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+	    (os.peer_isp && !ns.peer_isp) ||
+	    (os.user_isp && !ns.user_isp))
+		resume_next_sg(mdev);
+
+	/* Upon network connection, we need to start the receiver */
+	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
+		drbd_thread_start(&mdev->receiver);
+
+	/* Terminate worker thread if we are unconfigured - it will be
+	   restarted as needed... */
+	if (ns.disk == D_DISKLESS &&
+	    ns.conn == C_STANDALONE &&
+	    ns.role == R_SECONDARY) {
+		if (os.aftr_isp != ns.aftr_isp)
+			resume_next_sg(mdev);
+		/* set in __drbd_set_state, unless CONFIG_PENDING was set */
+		if (test_bit(DEVICE_DYING, &mdev->flags))
+			drbd_thread_stop_nowait(&mdev->worker);
+	}
+
+	drbd_md_sync(mdev);
+}
+
+
+static int drbd_thread_setup(void *arg)
+{
+	struct drbd_thread *thi = (struct drbd_thread *) arg;
+	struct drbd_conf *mdev = thi->mdev;
+	unsigned long flags;
+	int retval;
+
+restart:
+	retval = thi->function(thi);
+
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	/* if the receiver has been "Exiting", the last thing it did
+	 * was set the conn state to "StandAlone",
+	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
+	 * and receiver thread will be "started".
+	 * drbd_thread_start needs to set "Restarting" in that case.
+	 * t_state check and assignment needs to be within the same spinlock,
+	 * so either thread_start sees Exiting, and can remap to Restarting,
+	 * or thread_start see None, and can proceed as normal.
+	 */
+
+	if (thi->t_state == Restarting) {
+		dev_info(DEV, "Restarting %s\n", current->comm);
+		thi->t_state = Running;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		goto restart;
+	}
+
+	thi->task = NULL;
+	thi->t_state = None;
+	smp_mb();
+	complete(&thi->stop);
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	dev_info(DEV, "Terminating %s\n", current->comm);
+
+	/* Release mod reference taken when thread was started */
+	module_put(THIS_MODULE);
+	return retval;
+}
+
+static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
+		      int (*func) (struct drbd_thread *))
+{
+	spin_lock_init(&thi->t_lock);
+	thi->task    = NULL;
+	thi->t_state = None;
+	thi->function = func;
+	thi->mdev = mdev;
+}
+
+int drbd_thread_start(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	struct task_struct *nt;
+	unsigned long flags;
+
+	const char *me =
+		thi == &mdev->receiver ? "receiver" :
+		thi == &mdev->asender  ? "asender"  :
+		thi == &mdev->worker   ? "worker"   : "NONSENSE";
+
+	/* is used from state engine doing drbd_thread_stop_nowait,
+	 * while holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	switch (thi->t_state) {
+	case None:
+		dev_info(DEV, "Starting %s thread (from %s [%d])\n",
+				me, current->comm, current->pid);
+
+		/* Get ref on module for thread - this is released when thread exits */
+		if (!try_module_get(THIS_MODULE)) {
+			dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return FALSE;
+		}
+
+		init_completion(&thi->stop);
+		D_ASSERT(thi->task == NULL);
+		thi->reset_cpu_mask = 1;
+		thi->t_state = Running;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
+
+		nt = kthread_create(drbd_thread_setup, (void *) thi,
+				    "drbd%d_%s", mdev_to_minor(mdev), me);
+
+		if (IS_ERR(nt)) {
+			dev_err(DEV, "Couldn't start thread\n");
+
+			module_put(THIS_MODULE);
+			return FALSE;
+		}
+		spin_lock_irqsave(&thi->t_lock, flags);
+		thi->task = nt;
+		thi->t_state = Running;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		wake_up_process(nt);
+		break;
+	case Exiting:
+		thi->t_state = Restarting;
+		dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
+				me, current->comm, current->pid);
+		/* fall through */
+	case Running:
+	case Restarting:
+	default:
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		break;
+	}
+
+	return TRUE;
+}
+
+
+void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
+{
+	unsigned long flags;
+
+	enum drbd_thread_state ns = restart ? Restarting : Exiting;
+
+	/* may be called from state engine, holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	if (thi->t_state == None) {
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		if (restart)
+			drbd_thread_start(thi);
+		return;
+	}
+
+	if (thi->t_state != ns) {
+		if (thi->task == NULL) {
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return;
+		}
+
+		thi->t_state = ns;
+		smp_mb();
+		init_completion(&thi->stop);
+		if (thi->task != current)
+			force_sig(DRBD_SIGKILL, thi->task);
+
+	}
+
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	if (wait)
+		wait_for_completion(&thi->stop);
+}
+
+#ifdef CONFIG_SMP
+/**
+ * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
+ * @mdev:	DRBD device.
+ *
+ * Forces all threads of a device onto the same CPU. This is beneficial for
+ * DRBD's performance. May be overwritten by user's configuration.
+ */
+void drbd_calc_cpu_mask(struct drbd_conf *mdev)
+{
+	int ord, cpu;
+
+	/* user override. */
+	if (cpumask_weight(mdev->cpu_mask))
+		return;
+
+	ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
+	for_each_online_cpu(cpu) {
+		if (ord-- == 0) {
+			cpumask_set_cpu(cpu, mdev->cpu_mask);
+			return;
+		}
+	}
+	/* should not be reached */
+	cpumask_setall(mdev->cpu_mask);
+}
+
+/**
+ * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
+ * @mdev:	DRBD device.
+ *
+ * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
+ * prematurely.
+ */
+void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
+{
+	struct task_struct *p = current;
+	struct drbd_thread *thi =
+		p == mdev->asender.task  ? &mdev->asender  :
+		p == mdev->receiver.task ? &mdev->receiver :
+		p == mdev->worker.task   ? &mdev->worker   :
+		NULL;
+	ERR_IF(thi == NULL)
+		return;
+	if (!thi->reset_cpu_mask)
+		return;
+	thi->reset_cpu_mask = 0;
+	set_cpus_allowed_ptr(p, mdev->cpu_mask);
+}
+#endif
+
+/* the appropriate socket mutex must be held already */
+int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
+			  enum drbd_packets cmd, struct p_header *h,
+			  size_t size, unsigned msg_flags)
+{
+	int sent, ok;
+
+	ERR_IF(!h) return FALSE;
+	ERR_IF(!size) return FALSE;
+
+	h->magic   = BE_DRBD_MAGIC;
+	h->command = cpu_to_be16(cmd);
+	h->length  = cpu_to_be16(size-sizeof(struct p_header));
+
+	trace_drbd_packet(mdev, sock, 0, (void *)h, __FILE__, __LINE__);
+	sent = drbd_send(mdev, sock, h, size, msg_flags);
+
+	ok = (sent == size);
+	if (!ok)
+		dev_err(DEV, "short sent %s size=%d sent=%d\n",
+		    cmdname(cmd), (int)size, sent);
+	return ok;
+}
+
+/* don't pass the socket. we may only look at it
+ * when we hold the appropriate socket mutex.
+ */
+int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
+		  enum drbd_packets cmd, struct p_header *h, size_t size)
+{
+	int ok = 0;
+	struct socket *sock;
+
+	if (use_data_socket) {
+		mutex_lock(&mdev->data.mutex);
+		sock = mdev->data.socket;
+	} else {
+		mutex_lock(&mdev->meta.mutex);
+		sock = mdev->meta.socket;
+	}
+
+	/* drbd_disconnect() could have called drbd_free_sock()
+	 * while we were waiting in down()... */
+	if (likely(sock != NULL))
+		ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
+
+	if (use_data_socket)
+		mutex_unlock(&mdev->data.mutex);
+	else
+		mutex_unlock(&mdev->meta.mutex);
+	return ok;
+}
+
+int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
+		   size_t size)
+{
+	struct p_header h;
+	int ok;
+
+	h.magic   = BE_DRBD_MAGIC;
+	h.command = cpu_to_be16(cmd);
+	h.length  = cpu_to_be16(size);
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+
+	trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&h, __FILE__, __LINE__);
+
+	ok = (sizeof(h) ==
+		drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
+	ok = ok && (size ==
+		drbd_send(mdev, mdev->data.socket, data, size, 0));
+
+	drbd_put_data_sock(mdev);
+
+	return ok;
+}
+
+int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
+{
+	struct p_rs_param_89 *p;
+	struct socket *sock;
+	int size, rv;
+	const int apv = mdev->agreed_pro_version;
+
+	size = apv <= 87 ? sizeof(struct p_rs_param)
+		: apv == 88 ? sizeof(struct p_rs_param)
+			+ strlen(mdev->sync_conf.verify_alg) + 1
+		: /* 89 */    sizeof(struct p_rs_param_89);
+
+	/* used from admin command context and receiver/worker context.
+	 * to avoid kmalloc, grab the socket right here,
+	 * then use the pre-allocated sbuf there */
+	mutex_lock(&mdev->data.mutex);
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
+
+		p = &mdev->data.sbuf.rs_param_89;
+
+		/* initialize verify_alg and csums_alg */
+		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+		p->rate = cpu_to_be32(sc->rate);
+
+		if (apv >= 88)
+			strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
+		if (apv >= 89)
+			strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
+
+		rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
+	} else
+		rv = 0; /* not ok */
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return rv;
+}
+
+int drbd_send_protocol(struct drbd_conf *mdev)
+{
+	struct p_protocol *p;
+	int size, rv;
+
+	size = sizeof(struct p_protocol);
+
+	if (mdev->agreed_pro_version >= 87)
+		size += strlen(mdev->net_conf->integrity_alg) + 1;
+
+	/* we must not recurse into our own queue,
+	 * as that is blocked during handshake */
+	p = kmalloc(size, GFP_NOIO);
+	if (p == NULL)
+		return 0;
+
+	p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
+	p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
+	p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
+	p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
+	p->want_lose     = cpu_to_be32(mdev->net_conf->want_lose);
+	p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
+
+	if (mdev->agreed_pro_version >= 87)
+		strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
+
+	rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
+			   (struct p_header *)p, size);
+	kfree(p);
+	return rv;
+}
+
+int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
+{
+	struct p_uuids p;
+	int i;
+
+	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
+		return 1;
+
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
+
+	mdev->comm_bm_set = drbd_bm_total_weight(mdev);
+	p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
+	uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
+	uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
+	uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
+	p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+
+	put_ldev(mdev);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
+			     (struct p_header *)&p, sizeof(p));
+}
+
+int drbd_send_uuids(struct drbd_conf *mdev)
+{
+	return _drbd_send_uuids(mdev, 0);
+}
+
+int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
+{
+	return _drbd_send_uuids(mdev, 8);
+}
+
+
+int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
+{
+	struct p_rs_uuid p;
+
+	p.uuid = cpu_to_be64(val);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
+			     (struct p_header *)&p, sizeof(p));
+}
+
+int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
+{
+	struct p_sizes p;
+	sector_t d_size, u_size;
+	int q_order_type;
+	int ok;
+
+	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		D_ASSERT(mdev->ldev->backing_bdev);
+		d_size = drbd_get_max_capacity(mdev->ldev);
+		u_size = mdev->ldev->dc.disk_size;
+		q_order_type = drbd_queue_order_type(mdev);
+		p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
+		put_ldev(mdev);
+	} else {
+		d_size = 0;
+		u_size = 0;
+		q_order_type = QUEUE_ORDERED_NONE;
+	}
+
+	p.d_size = cpu_to_be64(d_size);
+	p.u_size = cpu_to_be64(u_size);
+	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
+	p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
+	p.queue_order_type = cpu_to_be32(q_order_type);
+
+	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
+			   (struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+/**
+ * drbd_send_state() - Sends the drbd state to the peer
+ * @mdev:	DRBD device.
+ */
+int drbd_send_state(struct drbd_conf *mdev)
+{
+	struct socket *sock;
+	struct p_state p;
+	int ok = 0;
+
+	/* Grab state lock so we wont send state if we're in the middle
+	 * of a cluster wide state change on another thread */
+	drbd_state_lock(mdev);
+
+	mutex_lock(&mdev->data.mutex);
+
+	p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		ok = _drbd_send_cmd(mdev, sock, P_STATE,
+				    (struct p_header *)&p, sizeof(p), 0);
+	}
+
+	mutex_unlock(&mdev->data.mutex);
+
+	drbd_state_unlock(mdev);
+	return ok;
+}
+
+int drbd_send_state_req(struct drbd_conf *mdev,
+	union drbd_state mask, union drbd_state val)
+{
+	struct p_req_state p;
+
+	p.mask    = cpu_to_be32(mask.i);
+	p.val     = cpu_to_be32(val.i);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
+			     (struct p_header *)&p, sizeof(p));
+}
+
+int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
+{
+	struct p_req_state_reply p;
+
+	p.retcode    = cpu_to_be32(retcode);
+
+	return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
+			     (struct p_header *)&p, sizeof(p));
+}
+
+int fill_bitmap_rle_bits(struct drbd_conf *mdev,
+	struct p_compressed_bm *p,
+	struct bm_xfer_ctx *c)
+{
+	struct bitstream bs;
+	unsigned long plain_bits;
+	unsigned long tmp;
+	unsigned long rl;
+	unsigned len;
+	unsigned toggle;
+	int bits;
+
+	/* may we use this feature? */
+	if ((mdev->sync_conf.use_rle == 0) ||
+		(mdev->agreed_pro_version < 90))
+			return 0;
+
+	if (c->bit_offset >= c->bm_bits)
+		return 0; /* nothing to do. */
+
+	/* use at most thus many bytes */
+	bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
+	memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
+	/* plain bits covered in this code string */
+	plain_bits = 0;
+
+	/* p->encoding & 0x80 stores whether the first run length is set.
+	 * bit offset is implicit.
+	 * start with toggle == 2 to be able to tell the first iteration */
+	toggle = 2;
+
+	/* see how much plain bits we can stuff into one packet
+	 * using RLE and VLI. */
+	do {
+		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
+				    : _drbd_bm_find_next(mdev, c->bit_offset);
+		if (tmp == -1UL)
+			tmp = c->bm_bits;
+		rl = tmp - c->bit_offset;
+
+		if (toggle == 2) { /* first iteration */
+			if (rl == 0) {
+				/* the first checked bit was set,
+				 * store start value, */
+				DCBP_set_start(p, 1);
+				/* but skip encoding of zero run length */
+				toggle = !toggle;
+				continue;
+			}
+			DCBP_set_start(p, 0);
+		}
+
+		/* paranoia: catch zero runlength.
+		 * can only happen if bitmap is modified while we scan it. */
+		if (rl == 0) {
+			dev_err(DEV, "unexpected zero runlength while encoding bitmap "
+			    "t:%u bo:%lu\n", toggle, c->bit_offset);
+			return -1;
+		}
+
+		bits = vli_encode_bits(&bs, rl);
+		if (bits == -ENOBUFS) /* buffer full */
+			break;
+		if (bits <= 0) {
+			dev_err(DEV, "error while encoding bitmap: %d\n", bits);
+			return 0;
+		}
+
+		toggle = !toggle;
+		plain_bits += rl;
+		c->bit_offset = tmp;
+	} while (c->bit_offset < c->bm_bits);
+
+	len = bs.cur.b - p->code + !!bs.cur.bit;
+
+	if (plain_bits < (len << 3)) {
+		/* incompressible with this method.
+		 * we need to rewind both word and bit position. */
+		c->bit_offset -= plain_bits;
+		bm_xfer_ctx_bit_to_word_offset(c);
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+		return 0;
+	}
+
+	/* RLE + VLI was able to compress it just fine.
+	 * update c->word_offset. */
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	/* store pad_bits */
+	DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+
+	return len;
+}
+
+enum { OK, FAILED, DONE }
+send_bitmap_rle_or_plain(struct drbd_conf *mdev,
+	struct p_header *h, struct bm_xfer_ctx *c)
+{
+	struct p_compressed_bm *p = (void*)h;
+	unsigned long num_words;
+	int len;
+	int ok;
+
+	len = fill_bitmap_rle_bits(mdev, p, c);
+
+	if (len < 0)
+		return FAILED;
+
+	if (len) {
+		DCBP_set_code(p, RLE_VLI_Bits);
+		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
+			sizeof(*p) + len, 0);
+
+		c->packets[0]++;
+		c->bytes[0] += sizeof(*p) + len;
+
+		if (c->bit_offset >= c->bm_bits)
+			len = 0; /* DONE */
+	} else {
+		/* was not compressible.
+		 * send a buffer full of plain text bits instead. */
+		num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
+		len = num_words * sizeof(long);
+		if (len)
+			drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
+		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
+				   h, sizeof(struct p_header) + len, 0);
+		c->word_offset += num_words;
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+
+		c->packets[1]++;
+		c->bytes[1] += sizeof(struct p_header) + len;
+
+		if (c->bit_offset > c->bm_bits)
+			c->bit_offset = c->bm_bits;
+	}
+	ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
+
+	if (ok == DONE)
+		INFO_bm_xfer_stats(mdev, "send", c);
+	return ok;
+}
+
+/* See the comment at receive_bitmap() */
+int _drbd_send_bitmap(struct drbd_conf *mdev)
+{
+	struct bm_xfer_ctx c;
+	struct p_header *p;
+	int ret;
+
+	ERR_IF(!mdev->bitmap) return FALSE;
+
+	/* maybe we should use some per thread scratch page,
+	 * and allocate that during initial device creation? */
+	p = (struct p_header *) __get_free_page(GFP_NOIO);
+	if (!p) {
+		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+		return FALSE;
+	}
+
+	if (get_ldev(mdev)) {
+		if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+			dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
+			drbd_bm_set_all(mdev);
+			if (drbd_bm_write(mdev)) {
+				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
+				 * but otherwise process as per normal - need to tell other
+				 * side that a full resync is required! */
+				dev_err(DEV, "Failed to write bitmap to disk!\n");
+			} else {
+				drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
+				drbd_md_sync(mdev);
+			}
+		}
+		put_ldev(mdev);
+	}
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(mdev),
+		.bm_words = drbd_bm_words(mdev),
+	};
+
+	do {
+		ret = send_bitmap_rle_or_plain(mdev, p, &c);
+	} while (ret == OK);
+
+	free_page((unsigned long) p);
+	return (ret == DONE);
+}
+
+int drbd_send_bitmap(struct drbd_conf *mdev)
+{
+	int err;
+
+	if (!drbd_get_data_sock(mdev))
+		return -1;
+	err = !_drbd_send_bitmap(mdev);
+	drbd_put_data_sock(mdev);
+	return err;
+}
+
+int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
+{
+	int ok;
+	struct p_barrier_ack p;
+
+	p.barrier  = barrier_nr;
+	p.set_size = cpu_to_be32(set_size);
+
+	if (mdev->state.conn < C_CONNECTED)
+		return FALSE;
+	ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
+			(struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+/**
+ * _drbd_send_ack() - Sends an ack packet
+ * @mdev:	DRBD device.
+ * @cmd:	Packet command code.
+ * @sector:	sector, needs to be in big endian byte order
+ * @blksize:	size in byte, needs to be in big endian byte order
+ * @block_id:	Id, big endian byte order
+ */
+static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
+			  u64 sector,
+			  u32 blksize,
+			  u64 block_id)
+{
+	int ok;
+	struct p_block_ack p;
+
+	p.sector   = sector;
+	p.block_id = block_id;
+	p.blksize  = blksize;
+	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+
+	if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
+		return FALSE;
+	ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
+				(struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
+		     struct p_data *dp)
+{
+	const int header_size = sizeof(struct p_data)
+			      - sizeof(struct p_header);
+	int data_size  = ((struct p_header *)dp)->length - header_size;
+
+	return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
+			      dp->block_id);
+}
+
+int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
+		     struct p_block_req *rp)
+{
+	return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
+}
+
+/**
+ * drbd_send_ack() - Sends an ack packet
+ * @mdev:	DRBD device.
+ * @cmd:	Packet command code.
+ * @e:		Epoch entry.
+ */
+int drbd_send_ack(struct drbd_conf *mdev,
+	enum drbd_packets cmd, struct drbd_epoch_entry *e)
+{
+	return _drbd_send_ack(mdev, cmd,
+			      cpu_to_be64(e->sector),
+			      cpu_to_be32(e->size),
+			      e->block_id);
+}
+
+/* This function misuses the block_id field to signal if the blocks
+ * are is sync or not. */
+int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+		     sector_t sector, int blksize, u64 block_id)
+{
+	return _drbd_send_ack(mdev, cmd,
+			      cpu_to_be64(sector),
+			      cpu_to_be32(blksize),
+			      cpu_to_be64(block_id));
+}
+
+int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
+		       sector_t sector, int size, u64 block_id)
+{
+	int ok;
+	struct p_block_req p;
+
+	p.sector   = cpu_to_be64(sector);
+	p.block_id = block_id;
+	p.blksize  = cpu_to_be32(size);
+
+	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
+				(struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+int drbd_send_drequest_csum(struct drbd_conf *mdev,
+			    sector_t sector, int size,
+			    void *digest, int digest_size,
+			    enum drbd_packets cmd)
+{
+	int ok;
+	struct p_block_req p;
+
+	p.sector   = cpu_to_be64(sector);
+	p.block_id = BE_DRBD_MAGIC + 0xbeef;
+	p.blksize  = cpu_to_be32(size);
+
+	p.head.magic   = BE_DRBD_MAGIC;
+	p.head.command = cpu_to_be16(cmd);
+	p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
+
+	mutex_lock(&mdev->data.mutex);
+
+	ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
+	ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return ok;
+}
+
+int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	int ok;
+	struct p_block_req p;
+
+	p.sector   = cpu_to_be64(sector);
+	p.block_id = BE_DRBD_MAGIC + 0xbabe;
+	p.blksize  = cpu_to_be32(size);
+
+	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
+			   (struct p_header *)&p, sizeof(p));
+	return ok;
+}
+
+/* called on sndtimeo
+ * returns FALSE if we should retry,
+ * TRUE if we think connection is dead
+ */
+static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
+{
+	int drop_it;
+	/* long elapsed = (long)(jiffies - mdev->last_received); */
+
+	drop_it =   mdev->meta.socket == sock
+		|| !mdev->asender.task
+		|| get_t_state(&mdev->asender) != Running
+		|| mdev->state.conn < C_CONNECTED;
+
+	if (drop_it)
+		return TRUE;
+
+	drop_it = !--mdev->ko_count;
+	if (!drop_it) {
+		dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+		       current->comm, current->pid, mdev->ko_count);
+		request_ping(mdev);
+	}
+
+	return drop_it; /* && (mdev->state == R_PRIMARY) */;
+}
+
+/* The idea of sendpage seems to be to put some kind of reference
+ * to the page into the skb, and to hand it over to the NIC. In
+ * this process get_page() gets called.
+ *
+ * As soon as the page was really sent over the network put_page()
+ * gets called by some part of the network layer. [ NIC driver? ]
+ *
+ * [ get_page() / put_page() increment/decrement the count. If count
+ *   reaches 0 the page will be freed. ]
+ *
+ * This works nicely with pages from FSs.
+ * But this means that in protocol A we might signal IO completion too early!
+ *
+ * In order not to corrupt data during a resync we must make sure
+ * that we do not reuse our own buffer pages (EEs) to early, therefore
+ * we have the net_ee list.
+ *
+ * XFS seems to have problems, still, it submits pages with page_count == 0!
+ * As a workaround, we disable sendpage on pages
+ * with page_count == 0 or PageSlab.
+ */
+static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
+		   int offset, size_t size)
+{
+	int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
+	kunmap(page);
+	if (sent == size)
+		mdev->send_cnt += size>>9;
+	return sent == size;
+}
+
+static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
+		    int offset, size_t size)
+{
+	mm_segment_t oldfs = get_fs();
+	int sent, ok;
+	int len = size;
+
+	/* e.g. XFS meta- & log-data is in slab pages, which have a
+	 * page_count of 0 and/or have PageSlab() set.
+	 * we cannot use send_page for those, as that does get_page();
+	 * put_page(); and would cause either a VM_BUG directly, or
+	 * __page_cache_release a page that would actually still be referenced
+	 * by someone, leading to some obscure delayed Oops somewhere else. */
+	if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
+		return _drbd_no_send_page(mdev, page, offset, size);
+
+	drbd_update_congested(mdev);
+	set_fs(KERNEL_DS);
+	do {
+		sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
+							offset, len,
+							MSG_NOSIGNAL);
+		if (sent == -EAGAIN) {
+			if (we_should_drop_the_connection(mdev,
+							  mdev->data.socket))
+				break;
+			else
+				continue;
+		}
+		if (sent <= 0) {
+			dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
+			     __func__, (int)size, len, sent);
+			break;
+		}
+		len    -= sent;
+		offset += sent;
+	} while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
+	set_fs(oldfs);
+	clear_bit(NET_CONGESTED, &mdev->flags);
+
+	ok = (len == 0);
+	if (likely(ok))
+		mdev->send_cnt += size>>9;
+	return ok;
+}
+
+static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		if (!_drbd_no_send_page(mdev, bvec->bv_page,
+				     bvec->bv_offset, bvec->bv_len))
+			return 0;
+	}
+	return 1;
+}
+
+static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		if (!_drbd_send_page(mdev, bvec->bv_page,
+				     bvec->bv_offset, bvec->bv_len))
+			return 0;
+	}
+
+	return 1;
+}
+
+/* Used to send write requests
+ * R_PRIMARY -> Peer	(P_DATA)
+ */
+int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
+{
+	int ok = 1;
+	struct p_data p;
+	unsigned int dp_flags = 0;
+	void *dgb;
+	int dgs;
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+
+	p.head.magic   = BE_DRBD_MAGIC;
+	p.head.command = cpu_to_be16(P_DATA);
+	p.head.length  =
+		cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
+
+	p.sector   = cpu_to_be64(req->sector);
+	p.block_id = (unsigned long)req;
+	p.seq_num  = cpu_to_be32(req->seq_num =
+				 atomic_add_return(1, &mdev->packet_seq));
+	dp_flags = 0;
+
+	/* NOTE: no need to check if barriers supported here as we would
+	 *       not pass the test in make_request_common in that case
+	 */
+	if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) {
+		dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
+		/* dp_flags |= DP_HARDBARRIER; */
+	}
+	if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO))
+		dp_flags |= DP_RW_SYNC;
+	/* for now handle SYNCIO and UNPLUG
+	 * as if they still were one and the same flag */
+	if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG))
+		dp_flags |= DP_RW_SYNC;
+	if (mdev->state.conn >= C_SYNC_SOURCE &&
+	    mdev->state.conn <= C_PAUSED_SYNC_T)
+		dp_flags |= DP_MAY_SET_IN_SYNC;
+
+	p.dp_flags = cpu_to_be32(dp_flags);
+	trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__);
+	set_bit(UNPLUG_REMOTE, &mdev->flags);
+	ok = (sizeof(p) ==
+		drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
+	if (ok && dgs) {
+		dgb = mdev->int_dig_out;
+		drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
+		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+	}
+	if (ok) {
+		if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
+			ok = _drbd_send_bio(mdev, req->master_bio);
+		else
+			ok = _drbd_send_zc_bio(mdev, req->master_bio);
+	}
+
+	drbd_put_data_sock(mdev);
+	return ok;
+}
+
+/* answer packet, used to send data back for read requests:
+ *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
+ *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
+ */
+int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
+		    struct drbd_epoch_entry *e)
+{
+	int ok;
+	struct p_data p;
+	void *dgb;
+	int dgs;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+
+	p.head.magic   = BE_DRBD_MAGIC;
+	p.head.command = cpu_to_be16(cmd);
+	p.head.length  =
+		cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
+
+	p.sector   = cpu_to_be64(e->sector);
+	p.block_id = e->block_id;
+	/* p.seq_num  = 0;    No sequence numbers here.. */
+
+	/* Only called by our kernel thread.
+	 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
+	 * in response to admin command or module unload.
+	 */
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+
+	trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__);
+	ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
+					sizeof(p), MSG_MORE);
+	if (ok && dgs) {
+		dgb = mdev->int_dig_out;
+		drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
+		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+	}
+	if (ok)
+		ok = _drbd_send_zc_bio(mdev, e->private_bio);
+
+	drbd_put_data_sock(mdev);
+	return ok;
+}
+
+/*
+  drbd_send distinguishes two cases:
+
+  Packets sent via the data socket "sock"
+  and packets sent via the meta data socket "msock"
+
+		    sock                      msock
+  -----------------+-------------------------+------------------------------
+  timeout           conf.timeout / 2          conf.timeout / 2
+  timeout action    send a ping via msock     Abort communication
+					      and close all sockets
+*/
+
+/*
+ * you must have down()ed the appropriate [m]sock_mutex elsewhere!
+ */
+int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+	      void *buf, size_t size, unsigned msg_flags)
+{
+	struct kvec iov;
+	struct msghdr msg;
+	int rv, sent = 0;
+
+	if (!sock)
+		return -1000;
+
+	/* THINK  if (signal_pending) return ... ? */
+
+	iov.iov_base = buf;
+	iov.iov_len  = size;
+
+	msg.msg_name       = NULL;
+	msg.msg_namelen    = 0;
+	msg.msg_control    = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
+
+	if (sock == mdev->data.socket) {
+		mdev->ko_count = mdev->net_conf->ko_count;
+		drbd_update_congested(mdev);
+	}
+	do {
+		/* STRANGE
+		 * tcp_sendmsg does _not_ use its size parameter at all ?
+		 *
+		 * -EAGAIN on timeout, -EINTR on signal.
+		 */
+/* THINK
+ * do we need to block DRBD_SIG if sock == &meta.socket ??
+ * otherwise wake_asender() might interrupt some send_*Ack !
+ */
+		rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
+		if (rv == -EAGAIN) {
+			if (we_should_drop_the_connection(mdev, sock))
+				break;
+			else
+				continue;
+		}
+		D_ASSERT(rv != 0);
+		if (rv == -EINTR) {
+			flush_signals(current);
+			rv = 0;
+		}
+		if (rv < 0)
+			break;
+		sent += rv;
+		iov.iov_base += rv;
+		iov.iov_len  -= rv;
+	} while (sent < size);
+
+	if (sock == mdev->data.socket)
+		clear_bit(NET_CONGESTED, &mdev->flags);
+
+	if (rv <= 0) {
+		if (rv != -EAGAIN) {
+			dev_err(DEV, "%s_sendmsg returned %d\n",
+			    sock == mdev->meta.socket ? "msock" : "sock",
+			    rv);
+			drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+		} else
+			drbd_force_state(mdev, NS(conn, C_TIMEOUT));
+	}
+
+	return sent;
+}
+
+static int drbd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct drbd_conf *mdev = bdev->bd_disk->private_data;
+	unsigned long flags;
+	int rv = 0;
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	/* to have a stable mdev->state.role
+	 * and no race with updating open_cnt */
+
+	if (mdev->state.role != R_PRIMARY) {
+		if (mode & FMODE_WRITE)
+			rv = -EROFS;
+		else if (!allow_oos)
+			rv = -EMEDIUMTYPE;
+	}
+
+	if (!rv)
+		mdev->open_cnt++;
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	return rv;
+}
+
+static int drbd_release(struct gendisk *gd, fmode_t mode)
+{
+	struct drbd_conf *mdev = gd->private_data;
+	mdev->open_cnt--;
+	return 0;
+}
+
+static void drbd_unplug_fn(struct request_queue *q)
+{
+	struct drbd_conf *mdev = q->queuedata;
+
+	trace_drbd_unplug(mdev, "got unplugged");
+
+	/* unplug FIRST */
+	spin_lock_irq(q->queue_lock);
+	blk_remove_plug(q);
+	spin_unlock_irq(q->queue_lock);
+
+	/* only if connected */
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
+		D_ASSERT(mdev->state.role == R_PRIMARY);
+		if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
+			/* add to the data.work queue,
+			 * unless already queued.
+			 * XXX this might be a good addition to drbd_queue_work
+			 * anyways, to detect "double queuing" ... */
+			if (list_empty(&mdev->unplug_work.list))
+				drbd_queue_work(&mdev->data.work,
+						&mdev->unplug_work);
+		}
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (mdev->state.disk >= D_INCONSISTENT)
+		drbd_kick_lo(mdev);
+}
+
+static void drbd_set_defaults(struct drbd_conf *mdev)
+{
+	mdev->sync_conf.after      = DRBD_AFTER_DEF;
+	mdev->sync_conf.rate       = DRBD_RATE_DEF;
+	mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF;
+	mdev->state = (union drbd_state) {
+		{ .role = R_SECONDARY,
+		  .peer = R_UNKNOWN,
+		  .conn = C_STANDALONE,
+		  .disk = D_DISKLESS,
+		  .pdsk = D_UNKNOWN,
+		  .susp = 0
+		} };
+}
+
+void drbd_init_set_defaults(struct drbd_conf *mdev)
+{
+	/* the memset(,0,) did most of this.
+	 * note: only assignments, no allocation in here */
+
+	drbd_set_defaults(mdev);
+
+	/* for now, we do NOT yet support it,
+	 * even though we start some framework
+	 * to eventually support barriers */
+	set_bit(NO_BARRIER_SUPP, &mdev->flags);
+
+	atomic_set(&mdev->ap_bio_cnt, 0);
+	atomic_set(&mdev->ap_pending_cnt, 0);
+	atomic_set(&mdev->rs_pending_cnt, 0);
+	atomic_set(&mdev->unacked_cnt, 0);
+	atomic_set(&mdev->local_cnt, 0);
+	atomic_set(&mdev->net_cnt, 0);
+	atomic_set(&mdev->packet_seq, 0);
+	atomic_set(&mdev->pp_in_use, 0);
+
+	mutex_init(&mdev->md_io_mutex);
+	mutex_init(&mdev->data.mutex);
+	mutex_init(&mdev->meta.mutex);
+	sema_init(&mdev->data.work.s, 0);
+	sema_init(&mdev->meta.work.s, 0);
+	mutex_init(&mdev->state_mutex);
+
+	spin_lock_init(&mdev->data.work.q_lock);
+	spin_lock_init(&mdev->meta.work.q_lock);
+
+	spin_lock_init(&mdev->al_lock);
+	spin_lock_init(&mdev->req_lock);
+	spin_lock_init(&mdev->peer_seq_lock);
+	spin_lock_init(&mdev->epoch_lock);
+
+	INIT_LIST_HEAD(&mdev->active_ee);
+	INIT_LIST_HEAD(&mdev->sync_ee);
+	INIT_LIST_HEAD(&mdev->done_ee);
+	INIT_LIST_HEAD(&mdev->read_ee);
+	INIT_LIST_HEAD(&mdev->net_ee);
+	INIT_LIST_HEAD(&mdev->resync_reads);
+	INIT_LIST_HEAD(&mdev->data.work.q);
+	INIT_LIST_HEAD(&mdev->meta.work.q);
+	INIT_LIST_HEAD(&mdev->resync_work.list);
+	INIT_LIST_HEAD(&mdev->unplug_work.list);
+	INIT_LIST_HEAD(&mdev->md_sync_work.list);
+	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
+	mdev->resync_work.cb  = w_resync_inactive;
+	mdev->unplug_work.cb  = w_send_write_hint;
+	mdev->md_sync_work.cb = w_md_sync;
+	mdev->bm_io_work.w.cb = w_bitmap_io;
+	init_timer(&mdev->resync_timer);
+	init_timer(&mdev->md_sync_timer);
+	mdev->resync_timer.function = resync_timer_fn;
+	mdev->resync_timer.data = (unsigned long) mdev;
+	mdev->md_sync_timer.function = md_sync_timer_fn;
+	mdev->md_sync_timer.data = (unsigned long) mdev;
+
+	init_waitqueue_head(&mdev->misc_wait);
+	init_waitqueue_head(&mdev->state_wait);
+	init_waitqueue_head(&mdev->ee_wait);
+	init_waitqueue_head(&mdev->al_wait);
+	init_waitqueue_head(&mdev->seq_wait);
+
+	drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
+	drbd_thread_init(mdev, &mdev->worker, drbd_worker);
+	drbd_thread_init(mdev, &mdev->asender, drbd_asender);
+
+	mdev->agreed_pro_version = PRO_VERSION_MAX;
+	mdev->write_ordering = WO_bio_barrier;
+	mdev->resync_wenr = LC_FREE;
+}
+
+void drbd_mdev_cleanup(struct drbd_conf *mdev)
+{
+	if (mdev->receiver.t_state != None)
+		dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
+				mdev->receiver.t_state);
+
+	/* no need to lock it, I'm the only thread alive */
+	if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
+		dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
+	mdev->al_writ_cnt  =
+	mdev->bm_writ_cnt  =
+	mdev->read_cnt     =
+	mdev->recv_cnt     =
+	mdev->send_cnt     =
+	mdev->writ_cnt     =
+	mdev->p_size       =
+	mdev->rs_start     =
+	mdev->rs_total     =
+	mdev->rs_failed    =
+	mdev->rs_mark_left =
+	mdev->rs_mark_time = 0;
+	D_ASSERT(mdev->net_conf == NULL);
+
+	drbd_set_my_capacity(mdev, 0);
+	if (mdev->bitmap) {
+		/* maybe never allocated. */
+		drbd_bm_resize(mdev, 0);
+		drbd_bm_cleanup(mdev);
+	}
+
+	drbd_free_resources(mdev);
+
+	/*
+	 * currently we drbd_init_ee only on module load, so
+	 * we may do drbd_release_ee only on module unload!
+	 */
+	D_ASSERT(list_empty(&mdev->active_ee));
+	D_ASSERT(list_empty(&mdev->sync_ee));
+	D_ASSERT(list_empty(&mdev->done_ee));
+	D_ASSERT(list_empty(&mdev->read_ee));
+	D_ASSERT(list_empty(&mdev->net_ee));
+	D_ASSERT(list_empty(&mdev->resync_reads));
+	D_ASSERT(list_empty(&mdev->data.work.q));
+	D_ASSERT(list_empty(&mdev->meta.work.q));
+	D_ASSERT(list_empty(&mdev->resync_work.list));
+	D_ASSERT(list_empty(&mdev->unplug_work.list));
+
+}
+
+
+static void drbd_destroy_mempools(void)
+{
+	struct page *page;
+
+	while (drbd_pp_pool) {
+		page = drbd_pp_pool;
+		drbd_pp_pool = (struct page *)page_private(page);
+		__free_page(page);
+		drbd_pp_vacant--;
+	}
+
+	/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
+
+	if (drbd_ee_mempool)
+		mempool_destroy(drbd_ee_mempool);
+	if (drbd_request_mempool)
+		mempool_destroy(drbd_request_mempool);
+	if (drbd_ee_cache)
+		kmem_cache_destroy(drbd_ee_cache);
+	if (drbd_request_cache)
+		kmem_cache_destroy(drbd_request_cache);
+	if (drbd_bm_ext_cache)
+		kmem_cache_destroy(drbd_bm_ext_cache);
+	if (drbd_al_ext_cache)
+		kmem_cache_destroy(drbd_al_ext_cache);
+
+	drbd_ee_mempool      = NULL;
+	drbd_request_mempool = NULL;
+	drbd_ee_cache        = NULL;
+	drbd_request_cache   = NULL;
+	drbd_bm_ext_cache    = NULL;
+	drbd_al_ext_cache    = NULL;
+
+	return;
+}
+
+static int drbd_create_mempools(void)
+{
+	struct page *page;
+	const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
+	int i;
+
+	/* prepare our caches and mempools */
+	drbd_request_mempool = NULL;
+	drbd_ee_cache        = NULL;
+	drbd_request_cache   = NULL;
+	drbd_bm_ext_cache    = NULL;
+	drbd_al_ext_cache    = NULL;
+	drbd_pp_pool         = NULL;
+
+	/* caches */
+	drbd_request_cache = kmem_cache_create(
+		"drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
+	if (drbd_request_cache == NULL)
+		goto Enomem;
+
+	drbd_ee_cache = kmem_cache_create(
+		"drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
+	if (drbd_ee_cache == NULL)
+		goto Enomem;
+
+	drbd_bm_ext_cache = kmem_cache_create(
+		"drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
+	if (drbd_bm_ext_cache == NULL)
+		goto Enomem;
+
+	drbd_al_ext_cache = kmem_cache_create(
+		"drbd_al", sizeof(struct lc_element), 0, 0, NULL);
+	if (drbd_al_ext_cache == NULL)
+		goto Enomem;
+
+	/* mempools */
+	drbd_request_mempool = mempool_create(number,
+		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
+	if (drbd_request_mempool == NULL)
+		goto Enomem;
+
+	drbd_ee_mempool = mempool_create(number,
+		mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
+	if (drbd_request_mempool == NULL)
+		goto Enomem;
+
+	/* drbd's page pool */
+	spin_lock_init(&drbd_pp_lock);
+
+	for (i = 0; i < number; i++) {
+		page = alloc_page(GFP_HIGHUSER);
+		if (!page)
+			goto Enomem;
+		set_page_private(page, (unsigned long)drbd_pp_pool);
+		drbd_pp_pool = page;
+	}
+	drbd_pp_vacant = number;
+
+	return 0;
+
+Enomem:
+	drbd_destroy_mempools(); /* in case we allocated some */
+	return -ENOMEM;
+}
+
+static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
+	void *unused)
+{
+	/* just so we have it.  you never know what interesting things we
+	 * might want to do here some day...
+	 */
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block drbd_notifier = {
+	.notifier_call = drbd_notify_sys,
+};
+
+static void drbd_release_ee_lists(struct drbd_conf *mdev)
+{
+	int rr;
+
+	rr = drbd_release_ee(mdev, &mdev->active_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in active list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->sync_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in sync list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->read_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in read list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->done_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in done list found!\n", rr);
+
+	rr = drbd_release_ee(mdev, &mdev->net_ee);
+	if (rr)
+		dev_err(DEV, "%d EEs in net list found!\n", rr);
+}
+
+/* caution. no locking.
+ * currently only used from module cleanup code. */
+static void drbd_delete_device(unsigned int minor)
+{
+	struct drbd_conf *mdev = minor_to_mdev(minor);
+
+	if (!mdev)
+		return;
+
+	/* paranoia asserts */
+	if (mdev->open_cnt != 0)
+		dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
+				__FILE__ , __LINE__);
+
+	ERR_IF (!list_empty(&mdev->data.work.q)) {
+		struct list_head *lp;
+		list_for_each(lp, &mdev->data.work.q) {
+			dev_err(DEV, "lp = %p\n", lp);
+		}
+	};
+	/* end paranoia asserts */
+
+	del_gendisk(mdev->vdisk);
+
+	/* cleanup stuff that may have been allocated during
+	 * device (re-)configuration or state changes */
+
+	if (mdev->this_bdev)
+		bdput(mdev->this_bdev);
+
+	drbd_free_resources(mdev);
+
+	drbd_release_ee_lists(mdev);
+
+	/* should be free'd on disconnect? */
+	kfree(mdev->ee_hash);
+	/*
+	mdev->ee_hash_s = 0;
+	mdev->ee_hash = NULL;
+	*/
+
+	lc_destroy(mdev->act_log);
+	lc_destroy(mdev->resync);
+
+	kfree(mdev->p_uuid);
+	/* mdev->p_uuid = NULL; */
+
+	kfree(mdev->int_dig_out);
+	kfree(mdev->int_dig_in);
+	kfree(mdev->int_dig_vv);
+
+	/* cleanup the rest that has been
+	 * allocated from drbd_new_device
+	 * and actually free the mdev itself */
+	drbd_free_mdev(mdev);
+}
+
+static void drbd_cleanup(void)
+{
+	unsigned int i;
+
+	unregister_reboot_notifier(&drbd_notifier);
+
+	drbd_nl_cleanup();
+
+	if (minor_table) {
+		if (drbd_proc)
+			remove_proc_entry("drbd", NULL);
+		i = minor_count;
+		while (i--)
+			drbd_delete_device(i);
+		drbd_destroy_mempools();
+	}
+
+	kfree(minor_table);
+
+	unregister_blkdev(DRBD_MAJOR, "drbd");
+
+	printk(KERN_INFO "drbd: module cleanup done.\n");
+}
+
+/**
+ * drbd_congested() - Callback for pdflush
+ * @congested_data:	User data
+ * @bdi_bits:		Bits pdflush is currently interested in
+ *
+ * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ */
+static int drbd_congested(void *congested_data, int bdi_bits)
+{
+	struct drbd_conf *mdev = congested_data;
+	struct request_queue *q;
+	char reason = '-';
+	int r = 0;
+
+	if (!__inc_ap_bio_cond(mdev)) {
+		/* DRBD has frozen IO */
+		r = bdi_bits;
+		reason = 'd';
+		goto out;
+	}
+
+	if (get_ldev(mdev)) {
+		q = bdev_get_queue(mdev->ldev->backing_bdev);
+		r = bdi_congested(&q->backing_dev_info, bdi_bits);
+		put_ldev(mdev);
+		if (r)
+			reason = 'b';
+	}
+
+	if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
+		r |= (1 << BDI_async_congested);
+		reason = reason == 'b' ? 'a' : 'n';
+	}
+
+out:
+	mdev->congestion_reason = reason;
+	return r;
+}
+
+struct drbd_conf *drbd_new_device(unsigned int minor)
+{
+	struct drbd_conf *mdev;
+	struct gendisk *disk;
+	struct request_queue *q;
+
+	/* GFP_KERNEL, we are outside of all write-out paths */
+	mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
+	if (!mdev)
+		return NULL;
+	if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
+		goto out_no_cpumask;
+
+	mdev->minor = minor;
+
+	drbd_init_set_defaults(mdev);
+
+	q = blk_alloc_queue(GFP_KERNEL);
+	if (!q)
+		goto out_no_q;
+	mdev->rq_queue = q;
+	q->queuedata   = mdev;
+	blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
+
+	disk = alloc_disk(1);
+	if (!disk)
+		goto out_no_disk;
+	mdev->vdisk = disk;
+
+	set_disk_ro(disk, TRUE);
+
+	disk->queue = q;
+	disk->major = DRBD_MAJOR;
+	disk->first_minor = minor;
+	disk->fops = &drbd_ops;
+	sprintf(disk->disk_name, "drbd%d", minor);
+	disk->private_data = mdev;
+
+	mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
+	/* we have no partitions. we contain only ourselves. */
+	mdev->this_bdev->bd_contains = mdev->this_bdev;
+
+	q->backing_dev_info.congested_fn = drbd_congested;
+	q->backing_dev_info.congested_data = mdev;
+
+	blk_queue_make_request(q, drbd_make_request_26);
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+	blk_queue_merge_bvec(q, drbd_merge_bvec);
+	q->queue_lock = &mdev->req_lock; /* needed since we use */
+		/* plugging on a queue, that actually has no requests! */
+	q->unplug_fn = drbd_unplug_fn;
+
+	mdev->md_io_page = alloc_page(GFP_KERNEL);
+	if (!mdev->md_io_page)
+		goto out_no_io_page;
+
+	if (drbd_bm_init(mdev))
+		goto out_no_bitmap;
+	/* no need to lock access, we are still initializing this minor device. */
+	if (!tl_init(mdev))
+		goto out_no_tl;
+
+	mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
+	if (!mdev->app_reads_hash)
+		goto out_no_app_reads;
+
+	mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+	if (!mdev->current_epoch)
+		goto out_no_epoch;
+
+	INIT_LIST_HEAD(&mdev->current_epoch->list);
+	mdev->epochs = 1;
+
+	return mdev;
+
+/* out_whatever_else:
+	kfree(mdev->current_epoch); */
+out_no_epoch:
+	kfree(mdev->app_reads_hash);
+out_no_app_reads:
+	tl_cleanup(mdev);
+out_no_tl:
+	drbd_bm_cleanup(mdev);
+out_no_bitmap:
+	__free_page(mdev->md_io_page);
+out_no_io_page:
+	put_disk(disk);
+out_no_disk:
+	blk_cleanup_queue(q);
+out_no_q:
+	free_cpumask_var(mdev->cpu_mask);
+out_no_cpumask:
+	kfree(mdev);
+	return NULL;
+}
+
+/* counterpart of drbd_new_device.
+ * last part of drbd_delete_device. */
+void drbd_free_mdev(struct drbd_conf *mdev)
+{
+	kfree(mdev->current_epoch);
+	kfree(mdev->app_reads_hash);
+	tl_cleanup(mdev);
+	if (mdev->bitmap) /* should no longer be there. */
+		drbd_bm_cleanup(mdev);
+	__free_page(mdev->md_io_page);
+	put_disk(mdev->vdisk);
+	blk_cleanup_queue(mdev->rq_queue);
+	free_cpumask_var(mdev->cpu_mask);
+	kfree(mdev);
+}
+
+
+int __init drbd_init(void)
+{
+	int err;
+
+	if (sizeof(struct p_handshake) != 80) {
+		printk(KERN_ERR
+		       "drbd: never change the size or layout "
+		       "of the HandShake packet.\n");
+		return -EINVAL;
+	}
+
+	if (1 > minor_count || minor_count > 255) {
+		printk(KERN_ERR
+			"drbd: invalid minor_count (%d)\n", minor_count);
+#ifdef MODULE
+		return -EINVAL;
+#else
+		minor_count = 8;
+#endif
+	}
+
+	err = drbd_nl_init();
+	if (err)
+		return err;
+
+	err = register_blkdev(DRBD_MAJOR, "drbd");
+	if (err) {
+		printk(KERN_ERR
+		       "drbd: unable to register block device major %d\n",
+		       DRBD_MAJOR);
+		return err;
+	}
+
+	register_reboot_notifier(&drbd_notifier);
+
+	/*
+	 * allocate all necessary structs
+	 */
+	err = -ENOMEM;
+
+	init_waitqueue_head(&drbd_pp_wait);
+
+	drbd_proc = NULL; /* play safe for drbd_cleanup */
+	minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
+				GFP_KERNEL);
+	if (!minor_table)
+		goto Enomem;
+
+	err = drbd_create_mempools();
+	if (err)
+		goto Enomem;
+
+	drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
+	if (!drbd_proc)	{
+		printk(KERN_ERR "drbd: unable to register proc file\n");
+		goto Enomem;
+	}
+
+	rwlock_init(&global_state_lock);
+
+	printk(KERN_INFO "drbd: initialized. "
+	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
+	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
+	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
+	printk(KERN_INFO "drbd: registered as block device major %d\n",
+		DRBD_MAJOR);
+	printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
+
+	return 0; /* Success! */
+
+Enomem:
+	drbd_cleanup();
+	if (err == -ENOMEM)
+		/* currently always the case */
+		printk(KERN_ERR "drbd: ran out of memory\n");
+	else
+		printk(KERN_ERR "drbd: initialization failure\n");
+	return err;
+}
+
+void drbd_free_bc(struct drbd_backing_dev *ldev)
+{
+	if (ldev == NULL)
+		return;
+
+	bd_release(ldev->backing_bdev);
+	bd_release(ldev->md_bdev);
+
+	fput(ldev->lo_file);
+	fput(ldev->md_file);
+
+	kfree(ldev);
+}
+
+void drbd_free_sock(struct drbd_conf *mdev)
+{
+	if (mdev->data.socket) {
+		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
+		sock_release(mdev->data.socket);
+		mdev->data.socket = NULL;
+	}
+	if (mdev->meta.socket) {
+		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
+		sock_release(mdev->meta.socket);
+		mdev->meta.socket = NULL;
+	}
+}
+
+
+void drbd_free_resources(struct drbd_conf *mdev)
+{
+	crypto_free_hash(mdev->csums_tfm);
+	mdev->csums_tfm = NULL;
+	crypto_free_hash(mdev->verify_tfm);
+	mdev->verify_tfm = NULL;
+	crypto_free_hash(mdev->cram_hmac_tfm);
+	mdev->cram_hmac_tfm = NULL;
+	crypto_free_hash(mdev->integrity_w_tfm);
+	mdev->integrity_w_tfm = NULL;
+	crypto_free_hash(mdev->integrity_r_tfm);
+	mdev->integrity_r_tfm = NULL;
+
+	drbd_free_sock(mdev);
+
+	__no_warn(local,
+		  drbd_free_bc(mdev->ldev);
+		  mdev->ldev = NULL;);
+}
+
+/* meta data management */
+
+struct meta_data_on_disk {
+	u64 la_size;           /* last agreed size. */
+	u64 uuid[UI_SIZE];   /* UUIDs. */
+	u64 device_uuid;
+	u64 reserved_u64_1;
+	u32 flags;             /* MDF */
+	u32 magic;
+	u32 md_size_sect;
+	u32 al_offset;         /* offset to this block */
+	u32 al_nr_extents;     /* important for restoring the AL */
+	      /* `-- act_log->nr_elements <-- sync_conf.al_extents */
+	u32 bm_offset;         /* offset to the bitmap, from here */
+	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
+	u32 reserved_u32[4];
+
+} __packed;
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @mdev:	DRBD device.
+ */
+void drbd_md_sync(struct drbd_conf *mdev)
+{
+	struct meta_data_on_disk *buffer;
+	sector_t sector;
+	int i;
+
+	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
+		return;
+	del_timer(&mdev->md_sync_timer);
+
+	/* We use here D_FAILED and not D_ATTACHING because we try to write
+	 * metadata even if we detach due to a disk failure! */
+	if (!get_ldev_if_state(mdev, D_FAILED))
+		return;
+
+	trace_drbd_md_io(mdev, WRITE, mdev->ldev);
+
+	mutex_lock(&mdev->md_io_mutex);
+	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+	memset(buffer, 0, 512);
+
+	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
+	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
+	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
+
+	buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
+	buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
+	buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
+	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
+	buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
+
+	buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
+
+	D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
+	sector = mdev->ldev->md.md_offset;
+
+	if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+		clear_bit(MD_DIRTY, &mdev->flags);
+	} else {
+		/* this was a try anyways ... */
+		dev_err(DEV, "meta data update failed!\n");
+
+		drbd_chk_io_error(mdev, 1, TRUE);
+	}
+
+	/* Update mdev->ldev->md.la_size_sect,
+	 * since we updated it on metadata. */
+	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
+
+	mutex_unlock(&mdev->md_io_mutex);
+	put_ldev(mdev);
+}
+
+/**
+ * drbd_md_read() - Reads in the meta data super block
+ * @mdev:	DRBD device.
+ * @bdev:	Device from which the meta data should be read in.
+ *
+ * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
+ * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
+ */
+int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+	struct meta_data_on_disk *buffer;
+	int i, rv = NO_ERROR;
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING))
+		return ERR_IO_MD_DISK;
+
+	trace_drbd_md_io(mdev, READ, bdev);
+
+	mutex_lock(&mdev->md_io_mutex);
+	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+
+	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
+		/* NOTE: cant do normal error processing here as this is
+		   called BEFORE disk is attached */
+		dev_err(DEV, "Error while reading metadata.\n");
+		rv = ERR_IO_MD_DISK;
+		goto err;
+	}
+
+	if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
+		dev_err(DEV, "Error while reading metadata, magic not found.\n");
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+	if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
+		dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
+		    be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+		dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
+		    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+		dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
+		    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+
+	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+		dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+		    be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
+		rv = ERR_MD_INVALID;
+		goto err;
+	}
+
+	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+	bdev->md.flags = be32_to_cpu(buffer->flags);
+	mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
+	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+	if (mdev->sync_conf.al_extents < 7)
+		mdev->sync_conf.al_extents = 127;
+
+ err:
+	mutex_unlock(&mdev->md_io_mutex);
+	put_ldev(mdev);
+
+	return rv;
+}
+
+/**
+ * drbd_md_mark_dirty() - Mark meta data super block as dirty
+ * @mdev:	DRBD device.
+ *
+ * Call this function if you change anything that should be written to
+ * the meta-data super block. This function sets MD_DIRTY, and starts a
+ * timer that ensures that within five seconds you have to call drbd_md_sync().
+ */
+void drbd_md_mark_dirty(struct drbd_conf *mdev)
+{
+	set_bit(MD_DIRTY, &mdev->flags);
+	mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
+}
+
+
+static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
+{
+	int i;
+
+	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
+		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
+
+		trace_drbd_uuid(mdev, i+1);
+	}
+}
+
+void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+{
+	if (idx == UI_CURRENT) {
+		if (mdev->state.role == R_PRIMARY)
+			val |= 1;
+		else
+			val &= ~((u64)1);
+
+		drbd_set_ed_uuid(mdev, val);
+	}
+
+	mdev->ldev->md.uuid[idx] = val;
+	trace_drbd_uuid(mdev, idx);
+	drbd_md_mark_dirty(mdev);
+}
+
+
+void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
+{
+	if (mdev->ldev->md.uuid[idx]) {
+		drbd_uuid_move_history(mdev);
+		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
+		trace_drbd_uuid(mdev, UI_HISTORY_START);
+	}
+	_drbd_uuid_set(mdev, idx, val);
+}
+
+/**
+ * drbd_uuid_new_current() - Creates a new current UUID
+ * @mdev:	DRBD device.
+ *
+ * Creates a new current UUID, and rotates the old current UUID into
+ * the bitmap slot. Causes an incremental resync upon next connect.
+ */
+void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
+{
+	u64 val;
+
+	dev_info(DEV, "Creating new current UUID\n");
+	D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
+	mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
+	trace_drbd_uuid(mdev, UI_BITMAP);
+
+	get_random_bytes(&val, sizeof(u64));
+	_drbd_uuid_set(mdev, UI_CURRENT, val);
+}
+
+void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
+{
+	if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
+		return;
+
+	if (val == 0) {
+		drbd_uuid_move_history(mdev);
+		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
+		mdev->ldev->md.uuid[UI_BITMAP] = 0;
+		trace_drbd_uuid(mdev, UI_HISTORY_START);
+		trace_drbd_uuid(mdev, UI_BITMAP);
+	} else {
+		if (mdev->ldev->md.uuid[UI_BITMAP])
+			dev_warn(DEV, "bm UUID already set");
+
+		mdev->ldev->md.uuid[UI_BITMAP] = val;
+		mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
+
+		trace_drbd_uuid(mdev, UI_BITMAP);
+	}
+	drbd_md_mark_dirty(mdev);
+}
+
+/**
+ * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @mdev:	DRBD device.
+ *
+ * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_set_n_write(struct drbd_conf *mdev)
+{
+	int rv = -EIO;
+
+	if (get_ldev_if_state(mdev, D_ATTACHING)) {
+		drbd_md_set_flag(mdev, MDF_FULL_SYNC);
+		drbd_md_sync(mdev);
+		drbd_bm_set_all(mdev);
+
+		rv = drbd_bm_write(mdev);
+
+		if (!rv) {
+			drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
+			drbd_md_sync(mdev);
+		}
+
+		put_ldev(mdev);
+	}
+
+	return rv;
+}
+
+/**
+ * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @mdev:	DRBD device.
+ *
+ * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
+{
+	int rv = -EIO;
+
+	if (get_ldev_if_state(mdev, D_ATTACHING)) {
+		drbd_bm_clear_all(mdev);
+		rv = drbd_bm_write(mdev);
+		put_ldev(mdev);
+	}
+
+	return rv;
+}
+
+static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct bm_io_work *work = container_of(w, struct bm_io_work, w);
+	int rv;
+
+	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
+
+	drbd_bm_lock(mdev, work->why);
+	rv = work->io_fn(mdev);
+	drbd_bm_unlock(mdev);
+
+	clear_bit(BITMAP_IO, &mdev->flags);
+	wake_up(&mdev->misc_wait);
+
+	if (work->done)
+		work->done(mdev, rv);
+
+	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
+	work->why = NULL;
+
+	return 1;
+}
+
+/**
+ * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
+ * @mdev:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @done:	callback to be called after the bitmap IO was performed
+ * @why:	Descriptive text of the reason for doing the IO
+ *
+ * While IO on the bitmap happens we freeze application IO thus we ensure
+ * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
+ * called from worker context. It MUST NOT be used while a previous such
+ * work is still pending!
+ */
+void drbd_queue_bitmap_io(struct drbd_conf *mdev,
+			  int (*io_fn)(struct drbd_conf *),
+			  void (*done)(struct drbd_conf *, int),
+			  char *why)
+{
+	D_ASSERT(current == mdev->worker.task);
+
+	D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
+	D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
+	D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
+	if (mdev->bm_io_work.why)
+		dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
+			why, mdev->bm_io_work.why);
+
+	mdev->bm_io_work.io_fn = io_fn;
+	mdev->bm_io_work.done = done;
+	mdev->bm_io_work.why = why;
+
+	set_bit(BITMAP_IO, &mdev->flags);
+	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
+		if (list_empty(&mdev->bm_io_work.w.list)) {
+			set_bit(BITMAP_IO_QUEUED, &mdev->flags);
+			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+		} else
+			dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
+	}
+}
+
+/**
+ * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
+ * @mdev:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @why:	Descriptive text of the reason for doing the IO
+ *
+ * freezes application IO while that the actual IO operations runs. This
+ * functions MAY NOT be called from worker context.
+ */
+int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
+{
+	int rv;
+
+	D_ASSERT(current != mdev->worker.task);
+
+	drbd_suspend_io(mdev);
+
+	drbd_bm_lock(mdev, why);
+	rv = io_fn(mdev);
+	drbd_bm_unlock(mdev);
+
+	drbd_resume_io(mdev);
+
+	return rv;
+}
+
+void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+{
+	if ((mdev->ldev->md.flags & flag) != flag) {
+		drbd_md_mark_dirty(mdev);
+		mdev->ldev->md.flags |= flag;
+	}
+}
+
+void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
+{
+	if ((mdev->ldev->md.flags & flag) != 0) {
+		drbd_md_mark_dirty(mdev);
+		mdev->ldev->md.flags &= ~flag;
+	}
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+	return (bdev->md.flags & flag) != 0;
+}
+
+static void md_sync_timer_fn(unsigned long data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
+}
+
+static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+	drbd_md_sync(mdev);
+
+	return 1;
+}
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+/* Fault insertion support including random number generator shamelessly
+ * stolen from kernel/rcutorture.c */
+struct fault_random_state {
+	unsigned long state;
+	unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801  /* prime */
+#define FAULT_RANDOM_ADD	479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static unsigned long
+_drbd_fault_random(struct fault_random_state *rsp)
+{
+	long refresh;
+
+	if (--rsp->count < 0) {
+		get_random_bytes(&refresh, sizeof(refresh));
+		rsp->state += refresh;
+		rsp->count = FAULT_RANDOM_REFRESH;
+	}
+	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+	return swahw32(rsp->state);
+}
+
+static char *
+_drbd_fault_str(unsigned int type) {
+	static char *_faults[] = {
+		[DRBD_FAULT_MD_WR] = "Meta-data write",
+		[DRBD_FAULT_MD_RD] = "Meta-data read",
+		[DRBD_FAULT_RS_WR] = "Resync write",
+		[DRBD_FAULT_RS_RD] = "Resync read",
+		[DRBD_FAULT_DT_WR] = "Data write",
+		[DRBD_FAULT_DT_RD] = "Data read",
+		[DRBD_FAULT_DT_RA] = "Data read ahead",
+		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
+		[DRBD_FAULT_AL_EE] = "EE allocation"
+	};
+
+	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
+}
+
+unsigned int
+_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
+{
+	static struct fault_random_state rrs = {0, 0};
+
+	unsigned int ret = (
+		(fault_devs == 0 ||
+			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
+		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
+
+	if (ret) {
+		fault_count++;
+
+		if (printk_ratelimit())
+			dev_warn(DEV, "***Simulating %s failure\n",
+				_drbd_fault_str(type));
+	}
+
+	return ret;
+}
+#endif
+
+const char *drbd_buildtag(void)
+{
+	/* DRBD built from external sources has here a reference to the
+	   git hash of the source code. */
+
+	static char buildtag[38] = "\0uilt-in";
+
+	if (buildtag[0] == 0) {
+#ifdef CONFIG_MODULES
+		if (THIS_MODULE != NULL)
+			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+		else
+#endif
+			buildtag[0] = 'b';
+	}
+
+	return buildtag;
+}
+
+module_init(drbd_init)
+module_exit(drbd_cleanup)
+
+/* For drbd_tracing: */
+EXPORT_SYMBOL(drbd_conn_str);
+EXPORT_SYMBOL(drbd_role_str);
+EXPORT_SYMBOL(drbd_disk_str);
+EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 000000000000..1927acefe230
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,2365 @@
+/*
+   drbd_nl.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/connector.h>
+#include <linux/blkpg.h>
+#include <linux/cpumask.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_wrappers.h"
+#include <asm/unaligned.h>
+#include <linux/drbd_tag_magic.h>
+#include <linux/drbd_limits.h>
+
+static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
+static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
+static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
+
+/* see get_sb_bdev and bd_claim */
+static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
+
+/* Generate the tag_list to struct functions */
+#define NL_PACKET(name, number, fields) \
+static int name ## _from_tags(struct drbd_conf *mdev, \
+	unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
+static int name ## _from_tags(struct drbd_conf *mdev, \
+	unsigned short *tags, struct name *arg) \
+{ \
+	int tag; \
+	int dlen; \
+	\
+	while ((tag = get_unaligned(tags++)) != TT_END) {	\
+		dlen = get_unaligned(tags++);			\
+		switch (tag_number(tag)) { \
+		fields \
+		default: \
+			if (tag & T_MANDATORY) { \
+				dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
+				return 0; \
+			} \
+		} \
+		tags = (unsigned short *)((char *)tags + dlen); \
+	} \
+	return 1; \
+}
+#define NL_INTEGER(pn, pr, member) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
+		arg->member = get_unaligned((int *)(tags));	\
+		break;
+#define NL_INT64(pn, pr, member) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
+		arg->member = get_unaligned((u64 *)(tags));	\
+		break;
+#define NL_BIT(pn, pr, member) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
+		arg->member = *(char *)(tags) ? 1 : 0; \
+		break;
+#define NL_STRING(pn, pr, member, len) \
+	case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
+		if (dlen > len) { \
+			dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
+				#member, dlen, (unsigned int)len); \
+			return 0; \
+		} \
+		 arg->member ## _len = dlen; \
+		 memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
+		 break;
+#include "linux/drbd_nl.h"
+
+/* Generate the struct to tag_list functions */
+#define NL_PACKET(name, number, fields) \
+static unsigned short* \
+name ## _to_tags(struct drbd_conf *mdev, \
+	struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
+static unsigned short* \
+name ## _to_tags(struct drbd_conf *mdev, \
+	struct name *arg, unsigned short *tags) \
+{ \
+	fields \
+	return tags; \
+}
+
+#define NL_INTEGER(pn, pr, member) \
+	put_unaligned(pn | pr | TT_INTEGER, tags++);	\
+	put_unaligned(sizeof(int), tags++);		\
+	put_unaligned(arg->member, (int *)tags);	\
+	tags = (unsigned short *)((char *)tags+sizeof(int));
+#define NL_INT64(pn, pr, member) \
+	put_unaligned(pn | pr | TT_INT64, tags++);	\
+	put_unaligned(sizeof(u64), tags++);		\
+	put_unaligned(arg->member, (u64 *)tags);	\
+	tags = (unsigned short *)((char *)tags+sizeof(u64));
+#define NL_BIT(pn, pr, member) \
+	put_unaligned(pn | pr | TT_BIT, tags++);	\
+	put_unaligned(sizeof(char), tags++);		\
+	*(char *)tags = arg->member; \
+	tags = (unsigned short *)((char *)tags+sizeof(char));
+#define NL_STRING(pn, pr, member, len) \
+	put_unaligned(pn | pr | TT_STRING, tags++);	\
+	put_unaligned(arg->member ## _len, tags++);	\
+	memcpy(tags, arg->member, arg->member ## _len); \
+	tags = (unsigned short *)((char *)tags + arg->member ## _len);
+#include "linux/drbd_nl.h"
+
+void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
+void drbd_nl_send_reply(struct cn_msg *, int);
+
+int drbd_khelper(struct drbd_conf *mdev, char *cmd)
+{
+	char *envp[] = { "HOME=/",
+			"TERM=linux",
+			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+			NULL, /* Will be set to address family */
+			NULL, /* Will be set to address */
+			NULL };
+
+	char mb[12], af[20], ad[60], *afs;
+	char *argv[] = {usermode_helper, cmd, mb, NULL };
+	int ret;
+
+	snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
+
+	if (get_net_conf(mdev)) {
+		switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
+		case AF_INET6:
+			afs = "ipv6";
+			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
+				 &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
+			break;
+		case AF_INET:
+			afs = "ipv4";
+			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
+				 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
+			break;
+		default:
+			afs = "ssocks";
+			snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
+				 &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
+		}
+		snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
+		envp[3]=af;
+		envp[4]=ad;
+		put_net_conf(mdev);
+	}
+
+	dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
+
+	drbd_bcast_ev_helper(mdev, cmd);
+	ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+	if (ret)
+		dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+	else
+		dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
+				usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+
+	if (ret < 0) /* Ignore any ERRNOs we got. */
+		ret = 0;
+
+	return ret;
+}
+
+enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
+{
+	char *ex_to_string;
+	int r;
+	enum drbd_disk_state nps;
+	enum drbd_fencing_p fp;
+
+	D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+
+	if (get_ldev_if_state(mdev, D_CONSISTENT)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	} else {
+		dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
+		return mdev->state.pdsk;
+	}
+
+	if (fp == FP_STONITH)
+		_drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
+
+	r = drbd_khelper(mdev, "fence-peer");
+
+	switch ((r>>8) & 0xff) {
+	case 3: /* peer is inconsistent */
+		ex_to_string = "peer is inconsistent or worse";
+		nps = D_INCONSISTENT;
+		break;
+	case 4: /* peer got outdated, or was already outdated */
+		ex_to_string = "peer was fenced";
+		nps = D_OUTDATED;
+		break;
+	case 5: /* peer was down */
+		if (mdev->state.disk == D_UP_TO_DATE) {
+			/* we will(have) create(d) a new UUID anyways... */
+			ex_to_string = "peer is unreachable, assumed to be dead";
+			nps = D_OUTDATED;
+		} else {
+			ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
+			nps = mdev->state.pdsk;
+		}
+		break;
+	case 6: /* Peer is primary, voluntarily outdate myself.
+		 * This is useful when an unconnected R_SECONDARY is asked to
+		 * become R_PRIMARY, but finds the other peer being active. */
+		ex_to_string = "peer is active";
+		dev_warn(DEV, "Peer is primary, outdating myself.\n");
+		nps = D_UNKNOWN;
+		_drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
+		break;
+	case 7:
+		if (fp != FP_STONITH)
+			dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
+		ex_to_string = "peer was stonithed";
+		nps = D_OUTDATED;
+		break;
+	default:
+		/* The script is broken ... */
+		nps = D_UNKNOWN;
+		dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+		return nps;
+	}
+
+	dev_info(DEV, "fence-peer helper returned %d (%s)\n",
+			(r>>8) & 0xff, ex_to_string);
+	return nps;
+}
+
+
+int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
+{
+	const int max_tries = 4;
+	int r = 0;
+	int try = 0;
+	int forced = 0;
+	union drbd_state mask, val;
+	enum drbd_disk_state nps;
+
+	if (new_role == R_PRIMARY)
+		request_ping(mdev); /* Detect a dead peer ASAP */
+
+	mutex_lock(&mdev->state_mutex);
+
+	mask.i = 0; mask.role = R_MASK;
+	val.i  = 0; val.role  = new_role;
+
+	while (try++ < max_tries) {
+		r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
+
+		/* in case we first succeeded to outdate,
+		 * but now suddenly could establish a connection */
+		if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+			val.pdsk = 0;
+			mask.pdsk = 0;
+			continue;
+		}
+
+		if (r == SS_NO_UP_TO_DATE_DISK && force &&
+		    (mdev->state.disk == D_INCONSISTENT ||
+		     mdev->state.disk == D_OUTDATED)) {
+			mask.disk = D_MASK;
+			val.disk  = D_UP_TO_DATE;
+			forced = 1;
+			continue;
+		}
+
+		if (r == SS_NO_UP_TO_DATE_DISK &&
+		    mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
+			D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
+			nps = drbd_try_outdate_peer(mdev);
+
+			if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
+				val.disk = D_UP_TO_DATE;
+				mask.disk = D_MASK;
+			}
+
+			val.pdsk = nps;
+			mask.pdsk = D_MASK;
+
+			continue;
+		}
+
+		if (r == SS_NOTHING_TO_DO)
+			goto fail;
+		if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
+			nps = drbd_try_outdate_peer(mdev);
+
+			if (force && nps > D_OUTDATED) {
+				dev_warn(DEV, "Forced into split brain situation!\n");
+				nps = D_OUTDATED;
+			}
+
+			mask.pdsk = D_MASK;
+			val.pdsk  = nps;
+
+			continue;
+		}
+		if (r == SS_TWO_PRIMARIES) {
+			/* Maybe the peer is detected as dead very soon...
+			   retry at most once more in this case. */
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
+			if (try < max_tries)
+				try = max_tries - 1;
+			continue;
+		}
+		if (r < SS_SUCCESS) {
+			r = _drbd_request_state(mdev, mask, val,
+						CS_VERBOSE + CS_WAIT_COMPLETE);
+			if (r < SS_SUCCESS)
+				goto fail;
+		}
+		break;
+	}
+
+	if (r < SS_SUCCESS)
+		goto fail;
+
+	if (forced)
+		dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
+
+	/* Wait until nothing is on the fly :) */
+	wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
+
+	if (new_role == R_SECONDARY) {
+		set_disk_ro(mdev->vdisk, TRUE);
+		if (get_ldev(mdev)) {
+			mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+			put_ldev(mdev);
+		}
+	} else {
+		if (get_net_conf(mdev)) {
+			mdev->net_conf->want_lose = 0;
+			put_net_conf(mdev);
+		}
+		set_disk_ro(mdev->vdisk, FALSE);
+		if (get_ldev(mdev)) {
+			if (((mdev->state.conn < C_CONNECTED ||
+			       mdev->state.pdsk <= D_FAILED)
+			      && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
+				drbd_uuid_new_current(mdev);
+
+			mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
+			put_ldev(mdev);
+		}
+	}
+
+	if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
+		drbd_al_to_on_disk_bm(mdev);
+		put_ldev(mdev);
+	}
+
+	if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
+		/* if this was forced, we should consider sync */
+		if (forced)
+			drbd_send_uuids(mdev);
+		drbd_send_state(mdev);
+	}
+
+	drbd_md_sync(mdev);
+
+	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+ fail:
+	mutex_unlock(&mdev->state_mutex);
+	return r;
+}
+
+
+static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	struct primary primary_args;
+
+	memset(&primary_args, 0, sizeof(struct primary));
+	if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		return 0;
+	}
+
+	reply->ret_code =
+		drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer);
+
+	return 0;
+}
+
+static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
+
+	return 0;
+}
+
+/* initializes the md.*_offset members, so we are able to find
+ * the on disk meta data */
+static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
+				       struct drbd_backing_dev *bdev)
+{
+	sector_t md_size_sect = 0;
+	switch (bdev->dc.meta_dev_idx) {
+	default:
+		/* v07 style fixed size indexed meta data */
+		bdev->md.md_size_sect = MD_RESERVED_SECT;
+		bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
+		bdev->md.al_offset = MD_AL_OFFSET;
+		bdev->md.bm_offset = MD_BM_OFFSET;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		/* just occupy the full device; unit: sectors */
+		bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
+		bdev->md.md_offset = 0;
+		bdev->md.al_offset = MD_AL_OFFSET;
+		bdev->md.bm_offset = MD_BM_OFFSET;
+		break;
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
+		/* al size is still fixed */
+		bdev->md.al_offset = -MD_AL_MAX_SIZE;
+		/* we need (slightly less than) ~ this much bitmap sectors: */
+		md_size_sect = drbd_get_capacity(bdev->backing_bdev);
+		md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
+		md_size_sect = BM_SECT_TO_EXT(md_size_sect);
+		md_size_sect = ALIGN(md_size_sect, 8);
+
+		/* plus the "drbd meta data super block",
+		 * and the activity log; */
+		md_size_sect += MD_BM_OFFSET;
+
+		bdev->md.md_size_sect = md_size_sect;
+		/* bitmap offset is adjusted by 'super' block size */
+		bdev->md.bm_offset   = -md_size_sect + MD_AL_OFFSET;
+		break;
+	}
+}
+
+char *ppsize(char *buf, unsigned long long size)
+{
+	/* Needs 9 bytes at max. */
+	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
+	int base = 0;
+	while (size >= 10000) {
+		/* shift + round */
+		size = (size >> 10) + !!(size & (1<<9));
+		base++;
+	}
+	sprintf(buf, "%lu %cB", (long)size, units[base]);
+
+	return buf;
+}
+
+/* there is still a theoretical deadlock when called from receiver
+ * on an D_INCONSISTENT R_PRIMARY:
+ *  remote READ does inc_ap_bio, receiver would need to receive answer
+ *  packet from remote to dec_ap_bio again.
+ *  receiver receive_sizes(), comes here,
+ *  waits for ap_bio_cnt == 0. -> deadlock.
+ * but this cannot happen, actually, because:
+ *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
+ *  (not connected, or bad/no disk on peer):
+ *  see drbd_fail_request_early, ap_bio_cnt is zero.
+ *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
+ *  peer may not initiate a resize.
+ */
+void drbd_suspend_io(struct drbd_conf *mdev)
+{
+	set_bit(SUSPEND_IO, &mdev->flags);
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+}
+
+void drbd_resume_io(struct drbd_conf *mdev)
+{
+	clear_bit(SUSPEND_IO, &mdev->flags);
+	wake_up(&mdev->misc_wait);
+}
+
+/**
+ * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
+ * @mdev:	DRBD device.
+ *
+ * Returns 0 on success, negative return values indicate errors.
+ * You should call drbd_md_sync() after calling this function.
+ */
+enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local)
+{
+	sector_t prev_first_sect, prev_size; /* previous meta location */
+	sector_t la_size;
+	sector_t size;
+	char ppb[10];
+
+	int md_moved, la_size_changed;
+	enum determine_dev_size rv = unchanged;
+
+	/* race:
+	 * application request passes inc_ap_bio,
+	 * but then cannot get an AL-reference.
+	 * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+	 *
+	 * to avoid that:
+	 * Suspend IO right here.
+	 * still lock the act_log to not trigger ASSERTs there.
+	 */
+	drbd_suspend_io(mdev);
+
+	/* no wait necessary anymore, actually we could assert that */
+	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+	prev_first_sect = drbd_md_first_sector(mdev->ldev);
+	prev_size = mdev->ldev->md.md_size_sect;
+	la_size = mdev->ldev->md.la_size_sect;
+
+	/* TODO: should only be some assert here, not (re)init... */
+	drbd_md_set_sector_offsets(mdev, mdev->ldev);
+
+	size = drbd_new_dev_size(mdev, mdev->ldev);
+
+	if (drbd_get_capacity(mdev->this_bdev) != size ||
+	    drbd_bm_capacity(mdev) != size) {
+		int err;
+		err = drbd_bm_resize(mdev, size);
+		if (unlikely(err)) {
+			/* currently there is only one error: ENOMEM! */
+			size = drbd_bm_capacity(mdev)>>1;
+			if (size == 0) {
+				dev_err(DEV, "OUT OF MEMORY! "
+				    "Could not allocate bitmap!\n");
+			} else {
+				dev_err(DEV, "BM resizing failed. "
+				    "Leaving size unchanged at size = %lu KB\n",
+				    (unsigned long)size);
+			}
+			rv = dev_size_error;
+		}
+		/* racy, see comments above. */
+		drbd_set_my_capacity(mdev, size);
+		mdev->ldev->md.la_size_sect = size;
+		dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
+		     (unsigned long long)size>>1);
+	}
+	if (rv == dev_size_error)
+		goto out;
+
+	la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
+
+	md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
+		|| prev_size	   != mdev->ldev->md.md_size_sect;
+
+	if (la_size_changed || md_moved) {
+		drbd_al_shrink(mdev); /* All extents inactive. */
+		dev_info(DEV, "Writing the whole bitmap, %s\n",
+			 la_size_changed && md_moved ? "size changed and md moved" :
+			 la_size_changed ? "size changed" : "md moved");
+		rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
+		drbd_md_mark_dirty(mdev);
+	}
+
+	if (size > la_size)
+		rv = grew;
+	if (size < la_size)
+		rv = shrunk;
+out:
+	lc_unlock(mdev->act_log);
+	wake_up(&mdev->al_wait);
+	drbd_resume_io(mdev);
+
+	return rv;
+}
+
+sector_t
+drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+	sector_t p_size = mdev->p_size;   /* partner's disk size. */
+	sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
+	sector_t m_size; /* my size */
+	sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
+	sector_t size = 0;
+
+	m_size = drbd_get_max_capacity(bdev);
+
+	if (p_size && m_size) {
+		size = min_t(sector_t, p_size, m_size);
+	} else {
+		if (la_size) {
+			size = la_size;
+			if (m_size && m_size < size)
+				size = m_size;
+			if (p_size && p_size < size)
+				size = p_size;
+		} else {
+			if (m_size)
+				size = m_size;
+			if (p_size)
+				size = p_size;
+		}
+	}
+
+	if (size == 0)
+		dev_err(DEV, "Both nodes diskless!\n");
+
+	if (u_size) {
+		if (u_size > size)
+			dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
+			    (unsigned long)u_size>>1, (unsigned long)size>>1);
+		else
+			size = u_size;
+	}
+
+	return size;
+}
+
+/**
+ * drbd_check_al_size() - Ensures that the AL is of the right size
+ * @mdev:	DRBD device.
+ *
+ * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
+ * failed, and 0 on success. You should call drbd_md_sync() after you called
+ * this function.
+ */
+static int drbd_check_al_size(struct drbd_conf *mdev)
+{
+	struct lru_cache *n, *t;
+	struct lc_element *e;
+	unsigned int in_use;
+	int i;
+
+	ERR_IF(mdev->sync_conf.al_extents < 7)
+		mdev->sync_conf.al_extents = 127;
+
+	if (mdev->act_log &&
+	    mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
+		return 0;
+
+	in_use = 0;
+	t = mdev->act_log;
+	n = lc_create("act_log", drbd_al_ext_cache,
+		mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
+
+	if (n == NULL) {
+		dev_err(DEV, "Cannot allocate act_log lru!\n");
+		return -ENOMEM;
+	}
+	spin_lock_irq(&mdev->al_lock);
+	if (t) {
+		for (i = 0; i < t->nr_elements; i++) {
+			e = lc_element_by_index(t, i);
+			if (e->refcnt)
+				dev_err(DEV, "refcnt(%d)==%d\n",
+				    e->lc_number, e->refcnt);
+			in_use += e->refcnt;
+		}
+	}
+	if (!in_use)
+		mdev->act_log = n;
+	spin_unlock_irq(&mdev->al_lock);
+	if (in_use) {
+		dev_err(DEV, "Activity log still in use!\n");
+		lc_destroy(n);
+		return -EBUSY;
+	} else {
+		if (t)
+			lc_destroy(t);
+	}
+	drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */
+	return 0;
+}
+
+void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
+{
+	struct request_queue * const q = mdev->rq_queue;
+	struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
+	int max_segments = mdev->ldev->dc.max_bio_bvecs;
+
+	if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
+		max_seg_s = PAGE_SIZE;
+
+	max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
+
+	blk_queue_max_sectors(q, max_seg_s >> 9);
+	blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS);
+	blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS);
+	blk_queue_max_segment_size(q, max_seg_s);
+	blk_queue_logical_block_size(q, 512);
+	blk_queue_segment_boundary(q, PAGE_SIZE-1);
+	blk_stack_limits(&q->limits, &b->limits, 0);
+
+	if (b->merge_bvec_fn)
+		dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
+		     b->merge_bvec_fn);
+	dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
+
+	if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
+		dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
+		     q->backing_dev_info.ra_pages,
+		     b->backing_dev_info.ra_pages);
+		q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+	}
+}
+
+/* serialize deconfig (worker exiting, doing cleanup)
+ * and reconfig (drbdsetup disk, drbdsetup net)
+ *
+ * wait for a potentially exiting worker, then restart it,
+ * or start a new one.
+ */
+static void drbd_reconfig_start(struct drbd_conf *mdev)
+{
+	wait_event(mdev->state_wait, test_and_set_bit(CONFIG_PENDING, &mdev->flags));
+	wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
+	drbd_thread_start(&mdev->worker);
+}
+
+/* if still unconfigured, stops worker again.
+ * if configured now, clears CONFIG_PENDING.
+ * wakes potential waiters */
+static void drbd_reconfig_done(struct drbd_conf *mdev)
+{
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.disk == D_DISKLESS &&
+	    mdev->state.conn == C_STANDALONE &&
+	    mdev->state.role == R_SECONDARY) {
+		set_bit(DEVICE_DYING, &mdev->flags);
+		drbd_thread_stop_nowait(&mdev->worker);
+	} else
+		clear_bit(CONFIG_PENDING, &mdev->flags);
+	spin_unlock_irq(&mdev->req_lock);
+	wake_up(&mdev->state_wait);
+}
+
+/* does always return 0;
+ * interesting return code is in reply->ret_code */
+static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	enum drbd_ret_codes retcode;
+	enum determine_dev_size dd;
+	sector_t max_possible_sectors;
+	sector_t min_md_device_sectors;
+	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+	struct inode *inode, *inode2;
+	struct lru_cache *resync_lru = NULL;
+	union drbd_state ns, os;
+	int rv;
+	int cp_discovered = 0;
+	int logical_block_size;
+
+	drbd_reconfig_start(mdev);
+
+	/* if you want to reconfigure, please tear down first */
+	if (mdev->state.disk > D_DISKLESS) {
+		retcode = ERR_DISK_CONFIGURED;
+		goto fail;
+	}
+
+	/* allocation not in the IO path, cqueue thread context */
+	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
+	if (!nbc) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	nbc->dc.disk_size     = DRBD_DISK_SIZE_SECT_DEF;
+	nbc->dc.on_io_error   = DRBD_ON_IO_ERROR_DEF;
+	nbc->dc.fencing       = DRBD_FENCING_DEF;
+	nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
+
+	if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto fail;
+	}
+
+	nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
+	if (IS_ERR(nbc->lo_file)) {
+		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
+		    PTR_ERR(nbc->lo_file));
+		nbc->lo_file = NULL;
+		retcode = ERR_OPEN_DISK;
+		goto fail;
+	}
+
+	inode = nbc->lo_file->f_dentry->d_inode;
+
+	if (!S_ISBLK(inode->i_mode)) {
+		retcode = ERR_DISK_NOT_BDEV;
+		goto fail;
+	}
+
+	nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
+	if (IS_ERR(nbc->md_file)) {
+		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
+		    PTR_ERR(nbc->md_file));
+		nbc->md_file = NULL;
+		retcode = ERR_OPEN_MD_DISK;
+		goto fail;
+	}
+
+	inode2 = nbc->md_file->f_dentry->d_inode;
+
+	if (!S_ISBLK(inode2->i_mode)) {
+		retcode = ERR_MD_NOT_BDEV;
+		goto fail;
+	}
+
+	nbc->backing_bdev = inode->i_bdev;
+	if (bd_claim(nbc->backing_bdev, mdev)) {
+		printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
+		       nbc->backing_bdev, mdev,
+		       nbc->backing_bdev->bd_holder,
+		       nbc->backing_bdev->bd_contains->bd_holder,
+		       nbc->backing_bdev->bd_holders);
+		retcode = ERR_BDCLAIM_DISK;
+		goto fail;
+	}
+
+	resync_lru = lc_create("resync", drbd_bm_ext_cache,
+			61, sizeof(struct bm_extent),
+			offsetof(struct bm_extent, lce));
+	if (!resync_lru) {
+		retcode = ERR_NOMEM;
+		goto release_bdev_fail;
+	}
+
+	/* meta_dev_idx >= 0: external fixed size,
+	 * possibly multiple drbd sharing one meta device.
+	 * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
+	 * not yet used by some other drbd minor!
+	 * (if you use drbd.conf + drbdadm,
+	 * that should check it for you already; but if you don't, or someone
+	 * fooled it, we need to double check here) */
+	nbc->md_bdev = inode2->i_bdev;
+	if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
+				: (void *) drbd_m_holder)) {
+		retcode = ERR_BDCLAIM_MD_DISK;
+		goto release_bdev_fail;
+	}
+
+	if ((nbc->backing_bdev == nbc->md_bdev) !=
+	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto release_bdev2_fail;
+	}
+
+	/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
+	drbd_md_set_sector_offsets(mdev, nbc);
+
+	if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
+		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
+			(unsigned long long) drbd_get_max_capacity(nbc),
+			(unsigned long long) nbc->dc.disk_size);
+		retcode = ERR_DISK_TO_SMALL;
+		goto release_bdev2_fail;
+	}
+
+	if (nbc->dc.meta_dev_idx < 0) {
+		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
+		/* at least one MB, otherwise it does not make sense */
+		min_md_device_sectors = (2<<10);
+	} else {
+		max_possible_sectors = DRBD_MAX_SECTORS;
+		min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+	}
+
+	if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors)
+		dev_warn(DEV, "truncating very big lower level device "
+		     "to currently maximum possible %llu sectors\n",
+		     (unsigned long long) max_possible_sectors);
+
+	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
+		retcode = ERR_MD_DISK_TO_SMALL;
+		dev_warn(DEV, "refusing attach: md-device too small, "
+		     "at least %llu sectors needed for this meta-disk type\n",
+		     (unsigned long long) min_md_device_sectors);
+		goto release_bdev2_fail;
+	}
+
+	/* Make sure the new disk is big enough
+	 * (we may currently be R_PRIMARY with no local disk...) */
+	if (drbd_get_max_capacity(nbc) <
+	    drbd_get_capacity(mdev->this_bdev)) {
+		retcode = ERR_DISK_TO_SMALL;
+		goto release_bdev2_fail;
+	}
+
+	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
+
+	drbd_suspend_io(mdev);
+	/* also wait for the last barrier ack. */
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
+	/* and for any other previously queued work */
+	drbd_flush_workqueue(mdev);
+
+	retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
+	drbd_resume_io(mdev);
+	if (retcode < SS_SUCCESS)
+		goto release_bdev2_fail;
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING))
+		goto force_diskless;
+
+	drbd_md_set_sector_offsets(mdev, nbc);
+
+	if (!mdev->bitmap) {
+		if (drbd_bm_init(mdev)) {
+			retcode = ERR_NOMEM;
+			goto force_diskless_dec;
+		}
+	}
+
+	retcode = drbd_md_read(mdev, nbc);
+	if (retcode != NO_ERROR)
+		goto force_diskless_dec;
+
+	if (mdev->state.conn < C_CONNECTED &&
+	    mdev->state.role == R_PRIMARY &&
+	    (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
+		dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
+		    (unsigned long long)mdev->ed_uuid);
+		retcode = ERR_DATA_NOT_CURRENT;
+		goto force_diskless_dec;
+	}
+
+	/* Since we are diskless, fix the activity log first... */
+	if (drbd_check_al_size(mdev)) {
+		retcode = ERR_NOMEM;
+		goto force_diskless_dec;
+	}
+
+	/* Prevent shrinking of consistent devices ! */
+	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
+	   drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) {
+		dev_warn(DEV, "refusing to truncate a consistent device\n");
+		retcode = ERR_DISK_TO_SMALL;
+		goto force_diskless_dec;
+	}
+
+	if (!drbd_al_read_log(mdev, nbc)) {
+		retcode = ERR_IO_MD_DISK;
+		goto force_diskless_dec;
+	}
+
+	/* allocate a second IO page if logical_block_size != 512 */
+	logical_block_size = bdev_logical_block_size(nbc->md_bdev);
+	if (logical_block_size == 0)
+		logical_block_size = MD_SECTOR_SIZE;
+
+	if (logical_block_size != MD_SECTOR_SIZE) {
+		if (!mdev->md_io_tmpp) {
+			struct page *page = alloc_page(GFP_NOIO);
+			if (!page)
+				goto force_diskless_dec;
+
+			dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
+			     logical_block_size, MD_SECTOR_SIZE);
+			dev_warn(DEV, "Workaround engaged (has performance impact).\n");
+
+			mdev->md_io_tmpp = page;
+		}
+	}
+
+	/* Reset the "barriers don't work" bits here, then force meta data to
+	 * be written, to ensure we determine if barriers are supported. */
+	if (nbc->dc.no_md_flush)
+		set_bit(MD_NO_BARRIER, &mdev->flags);
+	else
+		clear_bit(MD_NO_BARRIER, &mdev->flags);
+
+	/* Point of no return reached.
+	 * Devices and memory are no longer released by error cleanup below.
+	 * now mdev takes over responsibility, and the state engine should
+	 * clean it up somewhere.  */
+	D_ASSERT(mdev->ldev == NULL);
+	mdev->ldev = nbc;
+	mdev->resync = resync_lru;
+	nbc = NULL;
+	resync_lru = NULL;
+
+	mdev->write_ordering = WO_bio_barrier;
+	drbd_bump_write_ordering(mdev, WO_bio_barrier);
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
+		set_bit(CRASHED_PRIMARY, &mdev->flags);
+	else
+		clear_bit(CRASHED_PRIMARY, &mdev->flags);
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
+		set_bit(CRASHED_PRIMARY, &mdev->flags);
+		cp_discovered = 1;
+	}
+
+	mdev->send_cnt = 0;
+	mdev->recv_cnt = 0;
+	mdev->read_cnt = 0;
+	mdev->writ_cnt = 0;
+
+	drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
+
+	/* If I am currently not R_PRIMARY,
+	 * but meta data primary indicator is set,
+	 * I just now recover from a hard crash,
+	 * and have been R_PRIMARY before that crash.
+	 *
+	 * Now, if I had no connection before that crash
+	 * (have been degraded R_PRIMARY), chances are that
+	 * I won't find my peer now either.
+	 *
+	 * In that case, and _only_ in that case,
+	 * we use the degr-wfc-timeout instead of the default,
+	 * so we can automatically recover from a crash of a
+	 * degraded but active "cluster" after a certain timeout.
+	 */
+	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+	if (mdev->state.role != R_PRIMARY &&
+	     drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
+	    !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
+		set_bit(USE_DEGR_WFC_T, &mdev->flags);
+
+	dd = drbd_determin_dev_size(mdev);
+	if (dd == dev_size_error) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto force_diskless_dec;
+	} else if (dd == grew)
+		set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+		dev_info(DEV, "Assuming that all blocks are out of sync "
+		     "(aka FullSync)\n");
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	} else {
+		if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	}
+
+	if (cp_discovered) {
+		drbd_al_apply_to_bm(mdev);
+		drbd_al_to_on_disk_bm(mdev);
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+	ns.i = os.i;
+	/* If MDF_CONSISTENT is not set go into inconsistent state,
+	   otherwise investigate MDF_WasUpToDate...
+	   If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
+	   otherwise into D_CONSISTENT state.
+	*/
+	if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
+		if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
+			ns.disk = D_CONSISTENT;
+		else
+			ns.disk = D_OUTDATED;
+	} else {
+		ns.disk = D_INCONSISTENT;
+	}
+
+	if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
+		ns.pdsk = D_OUTDATED;
+
+	if ( ns.disk == D_CONSISTENT &&
+	    (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
+		ns.disk = D_UP_TO_DATE;
+
+	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
+	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
+	   this point, because drbd_request_state() modifies these
+	   flags. */
+
+	/* In case we are C_CONNECTED postpone any decision on the new disk
+	   state after the negotiation phase. */
+	if (mdev->state.conn == C_CONNECTED) {
+		mdev->new_state_tmp.i = ns.i;
+		ns.i = os.i;
+		ns.disk = D_NEGOTIATING;
+	}
+
+	rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	ns = mdev->state;
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS)
+		goto force_diskless_dec;
+
+	if (mdev->state.role == R_PRIMARY)
+		mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
+	else
+		mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+
+	drbd_md_mark_dirty(mdev);
+	drbd_md_sync(mdev);
+
+	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+	put_ldev(mdev);
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+
+ force_diskless_dec:
+	put_ldev(mdev);
+ force_diskless:
+	drbd_force_state(mdev, NS(disk, D_DISKLESS));
+	drbd_md_sync(mdev);
+ release_bdev2_fail:
+	if (nbc)
+		bd_release(nbc->md_bdev);
+ release_bdev_fail:
+	if (nbc)
+		bd_release(nbc->backing_bdev);
+ fail:
+	if (nbc) {
+		if (nbc->lo_file)
+			fput(nbc->lo_file);
+		if (nbc->md_file)
+			fput(nbc->md_file);
+		kfree(nbc);
+	}
+	lc_destroy(resync_lru);
+
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+}
+
+static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			  struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
+	return 0;
+}
+
+static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			    struct drbd_nl_cfg_reply *reply)
+{
+	int i, ns;
+	enum drbd_ret_codes retcode;
+	struct net_conf *new_conf = NULL;
+	struct crypto_hash *tfm = NULL;
+	struct crypto_hash *integrity_w_tfm = NULL;
+	struct crypto_hash *integrity_r_tfm = NULL;
+	struct hlist_head *new_tl_hash = NULL;
+	struct hlist_head *new_ee_hash = NULL;
+	struct drbd_conf *odev;
+	char hmac_name[CRYPTO_MAX_ALG_NAME];
+	void *int_dig_out = NULL;
+	void *int_dig_in = NULL;
+	void *int_dig_vv = NULL;
+	struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+
+	drbd_reconfig_start(mdev);
+
+	if (mdev->state.conn > C_STANDALONE) {
+		retcode = ERR_NET_CONFIGURED;
+		goto fail;
+	}
+
+	/* allocation not in the IO path, cqueue thread context */
+	new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+	if (!new_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	memset(new_conf, 0, sizeof(struct net_conf));
+	new_conf->timeout	   = DRBD_TIMEOUT_DEF;
+	new_conf->try_connect_int  = DRBD_CONNECT_INT_DEF;
+	new_conf->ping_int	   = DRBD_PING_INT_DEF;
+	new_conf->max_epoch_size   = DRBD_MAX_EPOCH_SIZE_DEF;
+	new_conf->max_buffers	   = DRBD_MAX_BUFFERS_DEF;
+	new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
+	new_conf->sndbuf_size	   = DRBD_SNDBUF_SIZE_DEF;
+	new_conf->rcvbuf_size	   = DRBD_RCVBUF_SIZE_DEF;
+	new_conf->ko_count	   = DRBD_KO_COUNT_DEF;
+	new_conf->after_sb_0p	   = DRBD_AFTER_SB_0P_DEF;
+	new_conf->after_sb_1p	   = DRBD_AFTER_SB_1P_DEF;
+	new_conf->after_sb_2p	   = DRBD_AFTER_SB_2P_DEF;
+	new_conf->want_lose	   = 0;
+	new_conf->two_primaries    = 0;
+	new_conf->wire_protocol    = DRBD_PROT_C;
+	new_conf->ping_timeo	   = DRBD_PING_TIMEO_DEF;
+	new_conf->rr_conflict	   = DRBD_RR_CONFLICT_DEF;
+
+	if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (new_conf->two_primaries
+	    && (new_conf->wire_protocol != DRBD_PROT_C)) {
+		retcode = ERR_NOT_PROTO_C;
+		goto fail;
+	};
+
+	if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
+		retcode = ERR_DISCARD;
+		goto fail;
+	}
+
+	retcode = NO_ERROR;
+
+	new_my_addr = (struct sockaddr *)&new_conf->my_addr;
+	new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
+	for (i = 0; i < minor_count; i++) {
+		odev = minor_to_mdev(i);
+		if (!odev || odev == mdev)
+			continue;
+		if (get_net_conf(odev)) {
+			taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
+			if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
+			    !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
+				retcode = ERR_LOCAL_ADDR;
+
+			taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
+			if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
+			    !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
+				retcode = ERR_PEER_ADDR;
+
+			put_net_conf(odev);
+			if (retcode != NO_ERROR)
+				goto fail;
+		}
+	}
+
+	if (new_conf->cram_hmac_alg[0] != 0) {
+		snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+			new_conf->cram_hmac_alg);
+		tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(tfm)) {
+			tfm = NULL;
+			retcode = ERR_AUTH_ALG;
+			goto fail;
+		}
+
+		if (crypto_tfm_alg_type(crypto_hash_tfm(tfm))
+						!= CRYPTO_ALG_TYPE_HASH) {
+			retcode = ERR_AUTH_ALG_ND;
+			goto fail;
+		}
+	}
+
+	if (new_conf->integrity_alg[0]) {
+		integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(integrity_w_tfm)) {
+			integrity_w_tfm = NULL;
+			retcode=ERR_INTEGRITY_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
+			retcode=ERR_INTEGRITY_ALG_ND;
+			goto fail;
+		}
+
+		integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(integrity_r_tfm)) {
+			integrity_r_tfm = NULL;
+			retcode=ERR_INTEGRITY_ALG;
+			goto fail;
+		}
+	}
+
+	ns = new_conf->max_epoch_size/8;
+	if (mdev->tl_hash_s != ns) {
+		new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
+		if (!new_tl_hash) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	ns = new_conf->max_buffers/8;
+	if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
+		new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
+		if (!new_ee_hash) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+
+	if (integrity_w_tfm) {
+		i = crypto_hash_digestsize(integrity_w_tfm);
+		int_dig_out = kmalloc(i, GFP_KERNEL);
+		if (!int_dig_out) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+		int_dig_in = kmalloc(i, GFP_KERNEL);
+		if (!int_dig_in) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+		int_dig_vv = kmalloc(i, GFP_KERNEL);
+		if (!int_dig_vv) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	if (!mdev->bitmap) {
+		if(drbd_bm_init(mdev)) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->net_conf != NULL) {
+		retcode = ERR_NET_CONFIGURED;
+		spin_unlock_irq(&mdev->req_lock);
+		goto fail;
+	}
+	mdev->net_conf = new_conf;
+
+	mdev->send_cnt = 0;
+	mdev->recv_cnt = 0;
+
+	if (new_tl_hash) {
+		kfree(mdev->tl_hash);
+		mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
+		mdev->tl_hash = new_tl_hash;
+	}
+
+	if (new_ee_hash) {
+		kfree(mdev->ee_hash);
+		mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
+		mdev->ee_hash = new_ee_hash;
+	}
+
+	crypto_free_hash(mdev->cram_hmac_tfm);
+	mdev->cram_hmac_tfm = tfm;
+
+	crypto_free_hash(mdev->integrity_w_tfm);
+	mdev->integrity_w_tfm = integrity_w_tfm;
+
+	crypto_free_hash(mdev->integrity_r_tfm);
+	mdev->integrity_r_tfm = integrity_r_tfm;
+
+	kfree(mdev->int_dig_out);
+	kfree(mdev->int_dig_in);
+	kfree(mdev->int_dig_vv);
+	mdev->int_dig_out=int_dig_out;
+	mdev->int_dig_in=int_dig_in;
+	mdev->int_dig_vv=int_dig_vv;
+	spin_unlock_irq(&mdev->req_lock);
+
+	retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+
+fail:
+	kfree(int_dig_out);
+	kfree(int_dig_in);
+	kfree(int_dig_vv);
+	crypto_free_hash(tfm);
+	crypto_free_hash(integrity_w_tfm);
+	crypto_free_hash(integrity_r_tfm);
+	kfree(new_tl_hash);
+	kfree(new_ee_hash);
+	kfree(new_conf);
+
+	reply->ret_code = retcode;
+	drbd_reconfig_done(mdev);
+	return 0;
+}
+
+static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode;
+
+	retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
+
+	if (retcode == SS_NOTHING_TO_DO)
+		goto done;
+	else if (retcode == SS_ALREADY_STANDALONE)
+		goto done;
+	else if (retcode == SS_PRIMARY_NOP) {
+		/* Our statche checking code wants to see the peer outdated. */
+		retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
+						      pdsk, D_OUTDATED));
+	} else if (retcode == SS_CW_FAILED_BY_PEER) {
+		/* The peer probably wants to see us outdated. */
+		retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
+							disk, D_OUTDATED),
+					      CS_ORDERED);
+		if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			retcode = SS_SUCCESS;
+		}
+	}
+
+	if (retcode < SS_SUCCESS)
+		goto fail;
+
+	if (wait_event_interruptible(mdev->state_wait,
+				     mdev->state.conn != C_DISCONNECTING)) {
+		/* Do not test for mdev->state.conn == C_STANDALONE, since
+		   someone else might connect us in the mean time! */
+		retcode = ERR_INTR;
+		goto fail;
+	}
+
+ done:
+	retcode = NO_ERROR;
+ fail:
+	drbd_md_sync(mdev);
+	reply->ret_code = retcode;
+	return 0;
+}
+
+void resync_after_online_grow(struct drbd_conf *mdev)
+{
+	int iass; /* I am sync source */
+
+	dev_info(DEV, "Resync of new storage after online grow\n");
+	if (mdev->state.role != mdev->state.peer)
+		iass = (mdev->state.role == R_PRIMARY);
+	else
+		iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+
+	if (iass)
+		drbd_start_resync(mdev, C_SYNC_SOURCE);
+	else
+		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+}
+
+static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			  struct drbd_nl_cfg_reply *reply)
+{
+	struct resize rs;
+	int retcode = NO_ERROR;
+	int ldsc = 0; /* local disk size changed */
+	enum determine_dev_size dd;
+
+	memset(&rs, 0, sizeof(struct resize));
+	if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (mdev->state.conn > C_CONNECTED) {
+		retcode = ERR_RESIZE_RESYNC;
+		goto fail;
+	}
+
+	if (mdev->state.role == R_SECONDARY &&
+	    mdev->state.peer == R_SECONDARY) {
+		retcode = ERR_NO_PRIMARY;
+		goto fail;
+	}
+
+	if (!get_ldev(mdev)) {
+		retcode = ERR_NO_DISK;
+		goto fail;
+	}
+
+	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
+		mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+		ldsc = 1;
+	}
+
+	mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
+	dd = drbd_determin_dev_size(mdev);
+	drbd_md_sync(mdev);
+	put_ldev(mdev);
+	if (dd == dev_size_error) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto fail;
+	}
+
+	if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
+		if (dd == grew)
+			set_bit(RESIZE_PENDING, &mdev->flags);
+
+		drbd_send_uuids(mdev);
+		drbd_send_sizes(mdev, 1);
+	}
+
+ fail:
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			       struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+	int err;
+	int ovr; /* online verify running */
+	int rsr; /* re-sync running */
+	struct crypto_hash *verify_tfm = NULL;
+	struct crypto_hash *csums_tfm = NULL;
+	struct syncer_conf sc;
+	cpumask_var_t new_cpu_mask;
+
+	if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
+		memset(&sc, 0, sizeof(struct syncer_conf));
+		sc.rate       = DRBD_RATE_DEF;
+		sc.after      = DRBD_AFTER_DEF;
+		sc.al_extents = DRBD_AL_EXTENTS_DEF;
+	} else
+		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
+
+	if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	/* re-sync running */
+	rsr = (	mdev->state.conn == C_SYNC_SOURCE ||
+		mdev->state.conn == C_SYNC_TARGET ||
+		mdev->state.conn == C_PAUSED_SYNC_S ||
+		mdev->state.conn == C_PAUSED_SYNC_T );
+
+	if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
+		retcode = ERR_CSUMS_RESYNC_RUNNING;
+		goto fail;
+	}
+
+	if (!rsr && sc.csums_alg[0]) {
+		csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(csums_tfm)) {
+			csums_tfm = NULL;
+			retcode = ERR_CSUMS_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
+			retcode = ERR_CSUMS_ALG_ND;
+			goto fail;
+		}
+	}
+
+	/* online verify running */
+	ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
+
+	if (ovr) {
+		if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
+			retcode = ERR_VERIFY_RUNNING;
+			goto fail;
+		}
+	}
+
+	if (!ovr && sc.verify_alg[0]) {
+		verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(verify_tfm)) {
+			verify_tfm = NULL;
+			retcode = ERR_VERIFY_ALG;
+			goto fail;
+		}
+
+		if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
+			retcode = ERR_VERIFY_ALG_ND;
+			goto fail;
+		}
+	}
+
+	/* silently ignore cpu mask on UP kernel */
+	if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
+		err = __bitmap_parse(sc.cpu_mask, 32, 0,
+				cpumask_bits(new_cpu_mask), nr_cpu_ids);
+		if (err) {
+			dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
+			retcode = ERR_CPU_MASK_PARSE;
+			goto fail;
+		}
+	}
+
+	ERR_IF (sc.rate < 1) sc.rate = 1;
+	ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
+#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
+	if (sc.al_extents > AL_MAX) {
+		dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
+		sc.al_extents = AL_MAX;
+	}
+#undef AL_MAX
+
+	/* most sanity checks done, try to assign the new sync-after
+	 * dependency.  need to hold the global lock in there,
+	 * to avoid a race in the dependency loop check. */
+	retcode = drbd_alter_sa(mdev, sc.after);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	/* ok, assign the rest of it as well.
+	 * lock against receive_SyncParam() */
+	spin_lock(&mdev->peer_seq_lock);
+	mdev->sync_conf = sc;
+
+	if (!rsr) {
+		crypto_free_hash(mdev->csums_tfm);
+		mdev->csums_tfm = csums_tfm;
+		csums_tfm = NULL;
+	}
+
+	if (!ovr) {
+		crypto_free_hash(mdev->verify_tfm);
+		mdev->verify_tfm = verify_tfm;
+		verify_tfm = NULL;
+	}
+	spin_unlock(&mdev->peer_seq_lock);
+
+	if (get_ldev(mdev)) {
+		wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+		drbd_al_shrink(mdev);
+		err = drbd_check_al_size(mdev);
+		lc_unlock(mdev->act_log);
+		wake_up(&mdev->al_wait);
+
+		put_ldev(mdev);
+		drbd_md_sync(mdev);
+
+		if (err) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
+	if (mdev->state.conn >= C_CONNECTED)
+		drbd_send_sync_param(mdev, &sc);
+
+	if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
+		cpumask_copy(mdev->cpu_mask, new_cpu_mask);
+		drbd_calc_cpu_mask(mdev);
+		mdev->receiver.reset_cpu_mask = 1;
+		mdev->asender.reset_cpu_mask = 1;
+		mdev->worker.reset_cpu_mask = 1;
+	}
+
+	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
+fail:
+	free_cpumask_var(new_cpu_mask);
+	crypto_free_hash(csums_tfm);
+	crypto_free_hash(verify_tfm);
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode;
+
+	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
+
+	if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
+		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
+
+	while (retcode == SS_NEED_CONNECTION) {
+		spin_lock_irq(&mdev->req_lock);
+		if (mdev->state.conn < C_CONNECTED)
+			retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
+		spin_unlock_irq(&mdev->req_lock);
+
+		if (retcode != SS_NEED_CONNECTION)
+			break;
+
+		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
+	}
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+				   struct drbd_nl_cfg_reply *reply)
+{
+
+	reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+
+	return 0;
+}
+
+static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+
+	if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+		retcode = ERR_PAUSE_IS_SET;
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			       struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+
+	if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
+		retcode = ERR_PAUSE_IS_CLEAR;
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
+
+	return 0;
+}
+
+static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
+	return 0;
+}
+
+static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
+	return 0;
+}
+
+static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			   struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl;
+
+	tl = reply->tag_list;
+
+	if (get_ldev(mdev)) {
+		tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
+		put_ldev(mdev);
+	}
+
+	if (get_net_conf(mdev)) {
+		tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
+		put_net_conf(mdev);
+	}
+	tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
+
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl = reply->tag_list;
+	union drbd_state s = mdev->state;
+	unsigned long rs_left;
+	unsigned int res;
+
+	tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+
+	/* no local ref, no bitmap, no syncer progress. */
+	if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
+		if (get_ldev(mdev)) {
+			drbd_get_syncer_progress(mdev, &rs_left, &res);
+			tl = tl_add_int(tl, T_sync_progress, &res);
+			put_ldev(mdev);
+		}
+	}
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			     struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl;
+
+	tl = reply->tag_list;
+
+	if (get_ldev(mdev)) {
+		tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
+		tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
+		put_ldev(mdev);
+	}
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+/**
+ * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
+ * @mdev:	DRBD device.
+ * @nlp:	Netlink/connector packet from drbdsetup
+ * @reply:	Reply packet for drbdsetup
+ */
+static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+				    struct drbd_nl_cfg_reply *reply)
+{
+	unsigned short *tl;
+	char rv;
+
+	tl = reply->tag_list;
+
+	rv = mdev->state.pdsk == D_OUTDATED        ? UT_PEER_OUTDATED :
+	  test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
+
+	tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	return (int)((char *)tl - (char *)reply->tag_list);
+}
+
+static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+				    struct drbd_nl_cfg_reply *reply)
+{
+	/* default to resume from last known position, if possible */
+	struct start_ov args =
+		{ .start_sector = mdev->ov_start_sector };
+
+	if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		return 0;
+	}
+	/* w_make_ov_request expects position to be aligned */
+	mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
+	reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+	return 0;
+}
+
+
+static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
+			      struct drbd_nl_cfg_reply *reply)
+{
+	int retcode = NO_ERROR;
+	int skip_initial_sync = 0;
+	int err;
+
+	struct new_c_uuid args;
+
+	memset(&args, 0, sizeof(struct new_c_uuid));
+	if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		return 0;
+	}
+
+	mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
+
+	if (!get_ldev(mdev)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	/* this is "skip initial sync", assume to be clean */
+	if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
+	    mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
+		dev_info(DEV, "Preparing to skip initial sync\n");
+		skip_initial_sync = 1;
+	} else if (mdev->state.conn != C_STANDALONE) {
+		retcode = ERR_CONNECTED;
+		goto out_dec;
+	}
+
+	drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
+	drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
+
+	if (args.clear_bm) {
+		err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
+		if (err) {
+			dev_err(DEV, "Writing bitmap failed with %d\n",err);
+			retcode = ERR_IO_MD_DISK;
+		}
+		if (skip_initial_sync) {
+			drbd_send_uuids_skip_initial_sync(mdev);
+			_drbd_uuid_set(mdev, UI_BITMAP, 0);
+			spin_lock_irq(&mdev->req_lock);
+			_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			spin_unlock_irq(&mdev->req_lock);
+		}
+	}
+
+	drbd_md_sync(mdev);
+out_dec:
+	put_ldev(mdev);
+out:
+	mutex_unlock(&mdev->state_mutex);
+
+	reply->ret_code = retcode;
+	return 0;
+}
+
+static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
+{
+	struct drbd_conf *mdev;
+
+	if (nlp->drbd_minor >= minor_count)
+		return NULL;
+
+	mdev = minor_to_mdev(nlp->drbd_minor);
+
+	if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
+		struct gendisk *disk = NULL;
+		mdev = drbd_new_device(nlp->drbd_minor);
+
+		spin_lock_irq(&drbd_pp_lock);
+		if (minor_table[nlp->drbd_minor] == NULL) {
+			minor_table[nlp->drbd_minor] = mdev;
+			disk = mdev->vdisk;
+			mdev = NULL;
+		} /* else: we lost the race */
+		spin_unlock_irq(&drbd_pp_lock);
+
+		if (disk) /* we won the race above */
+			/* in case we ever add a drbd_delete_device(),
+			 * don't forget the del_gendisk! */
+			add_disk(disk);
+		else /* we lost the race above */
+			drbd_free_mdev(mdev);
+
+		mdev = minor_to_mdev(nlp->drbd_minor);
+	}
+
+	return mdev;
+}
+
+struct cn_handler_struct {
+	int (*function)(struct drbd_conf *,
+			 struct drbd_nl_cfg_req *,
+			 struct drbd_nl_cfg_reply *);
+	int reply_body_size;
+};
+
+static struct cn_handler_struct cnd_table[] = {
+	[ P_primary ]		= { &drbd_nl_primary,		0 },
+	[ P_secondary ]		= { &drbd_nl_secondary,		0 },
+	[ P_disk_conf ]		= { &drbd_nl_disk_conf,		0 },
+	[ P_detach ]		= { &drbd_nl_detach,		0 },
+	[ P_net_conf ]		= { &drbd_nl_net_conf,		0 },
+	[ P_disconnect ]	= { &drbd_nl_disconnect,	0 },
+	[ P_resize ]		= { &drbd_nl_resize,		0 },
+	[ P_syncer_conf ]	= { &drbd_nl_syncer_conf,	0 },
+	[ P_invalidate ]	= { &drbd_nl_invalidate,	0 },
+	[ P_invalidate_peer ]	= { &drbd_nl_invalidate_peer,	0 },
+	[ P_pause_sync ]	= { &drbd_nl_pause_sync,	0 },
+	[ P_resume_sync ]	= { &drbd_nl_resume_sync,	0 },
+	[ P_suspend_io ]	= { &drbd_nl_suspend_io,	0 },
+	[ P_resume_io ]		= { &drbd_nl_resume_io,		0 },
+	[ P_outdate ]		= { &drbd_nl_outdate,		0 },
+	[ P_get_config ]	= { &drbd_nl_get_config,
+				    sizeof(struct syncer_conf_tag_len_struct) +
+				    sizeof(struct disk_conf_tag_len_struct) +
+				    sizeof(struct net_conf_tag_len_struct) },
+	[ P_get_state ]		= { &drbd_nl_get_state,
+				    sizeof(struct get_state_tag_len_struct) +
+				    sizeof(struct sync_progress_tag_len_struct)	},
+	[ P_get_uuids ]		= { &drbd_nl_get_uuids,
+				    sizeof(struct get_uuids_tag_len_struct) },
+	[ P_get_timeout_flag ]	= { &drbd_nl_get_timeout_flag,
+				    sizeof(struct get_timeout_flag_tag_len_struct)},
+	[ P_start_ov ]		= { &drbd_nl_start_ov,		0 },
+	[ P_new_c_uuid ]	= { &drbd_nl_new_c_uuid,	0 },
+};
+
+static void drbd_connector_callback(struct cn_msg *req)
+{
+	struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
+	struct cn_handler_struct *cm;
+	struct cn_msg *cn_reply;
+	struct drbd_nl_cfg_reply *reply;
+	struct drbd_conf *mdev;
+	int retcode, rr;
+	int reply_size = sizeof(struct cn_msg)
+		+ sizeof(struct drbd_nl_cfg_reply)
+		+ sizeof(short int);
+
+	if (!try_module_get(THIS_MODULE)) {
+		printk(KERN_ERR "drbd: try_module_get() failed!\n");
+		return;
+	}
+
+	mdev = ensure_mdev(nlp);
+	if (!mdev) {
+		retcode = ERR_MINOR_INVALID;
+		goto fail;
+	}
+
+	trace_drbd_netlink(req, 1);
+
+	if (nlp->packet_type >= P_nl_after_last_packet) {
+		retcode = ERR_PACKET_NR;
+		goto fail;
+	}
+
+	cm = cnd_table + nlp->packet_type;
+
+	/* This may happen if packet number is 0: */
+	if (cm->function == NULL) {
+		retcode = ERR_PACKET_NR;
+		goto fail;
+	}
+
+	reply_size += cm->reply_body_size;
+
+	/* allocation not in the IO path, cqueue thread context */
+	cn_reply = kmalloc(reply_size, GFP_KERNEL);
+	if (!cn_reply) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+	reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
+
+	reply->packet_type =
+		cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
+	reply->minor = nlp->drbd_minor;
+	reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
+	/* reply->tag_list; might be modified by cm->function. */
+
+	rr = cm->function(mdev, nlp, reply);
+
+	cn_reply->id = req->id;
+	cn_reply->seq = req->seq;
+	cn_reply->ack = req->ack  + 1;
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
+	cn_reply->flags = 0;
+
+	trace_drbd_netlink(cn_reply, 0);
+	rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
+	if (rr && rr != -ESRCH)
+		printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+
+	kfree(cn_reply);
+	module_put(THIS_MODULE);
+	return;
+ fail:
+	drbd_nl_send_reply(req, retcode);
+	module_put(THIS_MODULE);
+}
+
+static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
+
+static unsigned short *
+__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
+	unsigned short len, int nul_terminated)
+{
+	unsigned short l = tag_descriptions[tag_number(tag)].max_len;
+	len = (len < l) ? len :  l;
+	put_unaligned(tag, tl++);
+	put_unaligned(len, tl++);
+	memcpy(tl, data, len);
+	tl = (unsigned short*)((char*)tl + len);
+	if (nul_terminated)
+		*((char*)tl - 1) = 0;
+	return tl;
+}
+
+static unsigned short *
+tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
+{
+	return __tl_add_blob(tl, tag, data, len, 0);
+}
+
+static unsigned short *
+tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
+{
+	return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
+}
+
+static unsigned short *
+tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
+{
+	put_unaligned(tag, tl++);
+	switch(tag_type(tag)) {
+	case TT_INTEGER:
+		put_unaligned(sizeof(int), tl++);
+		put_unaligned(*(int *)val, (int *)tl);
+		tl = (unsigned short*)((char*)tl+sizeof(int));
+		break;
+	case TT_INT64:
+		put_unaligned(sizeof(u64), tl++);
+		put_unaligned(*(u64 *)val, (u64 *)tl);
+		tl = (unsigned short*)((char*)tl+sizeof(u64));
+		break;
+	default:
+		/* someone did something stupid. */
+		;
+	}
+	return tl;
+}
+
+void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct get_state_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+
+	/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
+
+	tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
+
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_get_state;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	trace_drbd_netlink(cn_reply, 0);
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct call_helper_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+
+	/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
+
+	tl = tl_add_str(tl, T_helper, helper_name);
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_call_helper;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	trace_drbd_netlink(cn_reply, 0);
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+void drbd_bcast_ee(struct drbd_conf *mdev,
+		const char *reason, const int dgs,
+		const char* seen_hash, const char* calc_hash,
+		const struct drbd_epoch_entry* e)
+{
+	struct cn_msg *cn_reply;
+	struct drbd_nl_cfg_reply *reply;
+	struct bio_vec *bvec;
+	unsigned short *tl;
+	int i;
+
+	if (!e)
+		return;
+	if (!reason || !reason[0])
+		return;
+
+	/* apparently we have to memcpy twice, first to prepare the data for the
+	 * struct cn_msg, then within cn_netlink_send from the cn_msg to the
+	 * netlink skb. */
+	/* receiver thread context, which is not in the writeout path (of this node),
+	 * but may be in the writeout path of the _other_ node.
+	 * GFP_NOIO to avoid potential "distributed deadlock". */
+	cn_reply = kmalloc(
+		sizeof(struct cn_msg)+
+		sizeof(struct drbd_nl_cfg_reply)+
+		sizeof(struct dump_ee_tag_len_struct)+
+		sizeof(short int),
+		GFP_NOIO);
+
+	if (!cn_reply) {
+		dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
+				(unsigned long long)e->sector, e->size);
+		return;
+	}
+
+	reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
+	tl = reply->tag_list;
+
+	tl = tl_add_str(tl, T_dump_ee_reason, reason);
+	tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
+	tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
+	tl = tl_add_int(tl, T_ee_sector, &e->sector);
+	tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
+
+	put_unaligned(T_ee_data, tl++);
+	put_unaligned(e->size, tl++);
+
+	__bio_for_each_segment(bvec, e->private_bio, i, 0) {
+		void *d = kmap(bvec->bv_page);
+		memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
+		kunmap(bvec->bv_page);
+		tl=(unsigned short*)((char*)tl + bvec->bv_len);
+	}
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
+	cn_reply->ack = 0; // not used here.
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char*)tl - (char*)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_dump_ee;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	trace_drbd_netlink(cn_reply, 0);
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+	kfree(cn_reply);
+}
+
+void drbd_bcast_sync_progress(struct drbd_conf *mdev)
+{
+	char buffer[sizeof(struct cn_msg)+
+		    sizeof(struct drbd_nl_cfg_reply)+
+		    sizeof(struct sync_progress_tag_len_struct)+
+		    sizeof(short int)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	unsigned short *tl = reply->tag_list;
+	unsigned long rs_left;
+	unsigned int res;
+
+	/* no local ref, no bitmap, no syncer progress, no broadcast. */
+	if (!get_ldev(mdev))
+		return;
+	drbd_get_syncer_progress(mdev, &rs_left, &res);
+	put_ldev(mdev);
+
+	tl = tl_add_int(tl, T_sync_progress, &res);
+	put_unaligned(TT_END, tl++); /* Close the tag list */
+
+	cn_reply->id.idx = CN_IDX_DRBD;
+	cn_reply->id.val = CN_VAL_DRBD;
+
+	cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
+	cn_reply->ack = 0; /* not used here. */
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
+		(int)((char *)tl - (char *)reply->tag_list);
+	cn_reply->flags = 0;
+
+	reply->packet_type = P_sync_progress;
+	reply->minor = mdev_to_minor(mdev);
+	reply->ret_code = NO_ERROR;
+
+	trace_drbd_netlink(cn_reply, 0);
+	cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+}
+
+int __init drbd_nl_init(void)
+{
+	static struct cb_id cn_id_drbd;
+	int err, try=10;
+
+	cn_id_drbd.val = CN_VAL_DRBD;
+	do {
+		cn_id_drbd.idx = cn_idx;
+		err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
+		if (!err)
+			break;
+		cn_idx = (cn_idx + CN_IDX_STEP);
+	} while (try--);
+
+	if (err) {
+		printk(KERN_ERR "drbd: cn_drbd failed to register\n");
+		return err;
+	}
+
+	return 0;
+}
+
+void drbd_nl_cleanup(void)
+{
+	static struct cb_id cn_id_drbd;
+
+	cn_id_drbd.idx = cn_idx;
+	cn_id_drbd.val = CN_VAL_DRBD;
+
+	cn_del_callback(&cn_id_drbd);
+}
+
+void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
+{
+	char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
+	struct cn_msg *cn_reply = (struct cn_msg *) buffer;
+	struct drbd_nl_cfg_reply *reply =
+		(struct drbd_nl_cfg_reply *)cn_reply->data;
+	int rr;
+
+	cn_reply->id = req->id;
+
+	cn_reply->seq = req->seq;
+	cn_reply->ack = req->ack  + 1;
+	cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
+	cn_reply->flags = 0;
+
+	reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
+	reply->ret_code = ret_code;
+
+	trace_drbd_netlink(cn_reply, 0);
+	rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
+	if (rr && rr != -ESRCH)
+		printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+}
+
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 000000000000..98fcb7450c76
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,266 @@
+/*
+   drbd_proc.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+static int drbd_proc_open(struct inode *inode, struct file *file);
+
+
+struct proc_dir_entry *drbd_proc;
+struct file_operations drbd_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= drbd_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+
+/*lge
+ * progress bars shamelessly adapted from driver/md/md.c
+ * output looks like
+ *	[=====>..............] 33.5% (23456/123456)
+ *	finish: 2:20:20 speed: 6,345 (6,456) K/sec
+ */
+static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
+{
+	unsigned long db, dt, dbdt, rt, rs_left;
+	unsigned int res;
+	int i, x, y;
+
+	drbd_get_syncer_progress(mdev, &rs_left, &res);
+
+	x = res/50;
+	y = 20-x;
+	seq_printf(seq, "\t[");
+	for (i = 1; i < x; i++)
+		seq_printf(seq, "=");
+	seq_printf(seq, ">");
+	for (i = 0; i < y; i++)
+		seq_printf(seq, ".");
+	seq_printf(seq, "] ");
+
+	seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
+	/* if more than 1 GB display in MB */
+	if (mdev->rs_total > 0x100000L)
+		seq_printf(seq, "(%lu/%lu)M\n\t",
+			    (unsigned long) Bit2KB(rs_left >> 10),
+			    (unsigned long) Bit2KB(mdev->rs_total >> 10));
+	else
+		seq_printf(seq, "(%lu/%lu)K\n\t",
+			    (unsigned long) Bit2KB(rs_left),
+			    (unsigned long) Bit2KB(mdev->rs_total));
+
+	/* see drivers/md/md.c
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
+	 *
+	 * dt: time from mark until now
+	 * db: blocks written from mark until now
+	 * rt: remaining time
+	 */
+	dt = (jiffies - mdev->rs_mark_time) / HZ;
+
+	if (dt > 20) {
+		/* if we made no update to rs_mark_time for too long,
+		 * we are stalled. show that. */
+		seq_printf(seq, "stalled\n");
+		return;
+	}
+
+	if (!dt)
+		dt++;
+	db = mdev->rs_mark_left - rs_left;
+	rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
+
+	seq_printf(seq, "finish: %lu:%02lu:%02lu",
+		rt / 3600, (rt % 3600) / 60, rt % 60);
+
+	/* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
+	dbdt = Bit2KB(db/dt);
+	if (dbdt > 1000)
+		seq_printf(seq, " speed: %ld,%03ld",
+			dbdt/1000, dbdt % 1000);
+	else
+		seq_printf(seq, " speed: %ld", dbdt);
+
+	/* mean speed since syncer started
+	 * we do account for PausedSync periods */
+	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+	if (dt <= 0)
+		dt = 1;
+	db = mdev->rs_total - rs_left;
+	dbdt = Bit2KB(db/dt);
+	if (dbdt > 1000)
+		seq_printf(seq, " (%ld,%03ld)",
+			dbdt/1000, dbdt % 1000);
+	else
+		seq_printf(seq, " (%ld)", dbdt);
+
+	seq_printf(seq, " K/sec\n");
+}
+
+static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
+{
+	struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+	seq_printf(seq, "%5d %s %s\n", bme->rs_left,
+		   bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
+		   bme->flags & BME_LOCKED ? "LOCKED" : "------"
+		   );
+}
+
+static int drbd_seq_show(struct seq_file *seq, void *v)
+{
+	int i, hole = 0;
+	const char *sn;
+	struct drbd_conf *mdev;
+
+	static char write_ordering_chars[] = {
+		[WO_none] = 'n',
+		[WO_drain_io] = 'd',
+		[WO_bdev_flush] = 'f',
+		[WO_bio_barrier] = 'b',
+	};
+
+	seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
+		   API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
+
+	/*
+	  cs .. connection state
+	  ro .. node role (local/remote)
+	  ds .. disk state (local/remote)
+	     protocol
+	     various flags
+	  ns .. network send
+	  nr .. network receive
+	  dw .. disk write
+	  dr .. disk read
+	  al .. activity log write count
+	  bm .. bitmap update write count
+	  pe .. pending (waiting for ack or data reply)
+	  ua .. unack'd (still need to send ack or data reply)
+	  ap .. application requests accepted, but not yet completed
+	  ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
+	  wo .. write ordering mode currently in use
+	 oos .. known out-of-sync kB
+	*/
+
+	for (i = 0; i < minor_count; i++) {
+		mdev = minor_to_mdev(i);
+		if (!mdev) {
+			hole = 1;
+			continue;
+		}
+		if (hole) {
+			hole = 0;
+			seq_printf(seq, "\n");
+		}
+
+		sn = drbd_conn_str(mdev->state.conn);
+
+		if (mdev->state.conn == C_STANDALONE &&
+		    mdev->state.disk == D_DISKLESS &&
+		    mdev->state.role == R_SECONDARY) {
+			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
+		} else {
+			seq_printf(seq,
+			   "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
+			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
+			   "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
+			   i, sn,
+			   drbd_role_str(mdev->state.role),
+			   drbd_role_str(mdev->state.peer),
+			   drbd_disk_str(mdev->state.disk),
+			   drbd_disk_str(mdev->state.pdsk),
+			   (mdev->net_conf == NULL ? ' ' :
+			    (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
+			   mdev->state.susp ? 's' : 'r',
+			   mdev->state.aftr_isp ? 'a' : '-',
+			   mdev->state.peer_isp ? 'p' : '-',
+			   mdev->state.user_isp ? 'u' : '-',
+			   mdev->congestion_reason ?: '-',
+			   mdev->send_cnt/2,
+			   mdev->recv_cnt/2,
+			   mdev->writ_cnt/2,
+			   mdev->read_cnt/2,
+			   mdev->al_writ_cnt,
+			   mdev->bm_writ_cnt,
+			   atomic_read(&mdev->local_cnt),
+			   atomic_read(&mdev->ap_pending_cnt) +
+			   atomic_read(&mdev->rs_pending_cnt),
+			   atomic_read(&mdev->unacked_cnt),
+			   atomic_read(&mdev->ap_bio_cnt),
+			   mdev->epochs,
+			   write_ordering_chars[mdev->write_ordering]
+			);
+			seq_printf(seq, " oos:%lu\n",
+				   Bit2KB(drbd_bm_total_weight(mdev)));
+		}
+		if (mdev->state.conn == C_SYNC_SOURCE ||
+		    mdev->state.conn == C_SYNC_TARGET)
+			drbd_syncer_progress(mdev, seq);
+
+		if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+			seq_printf(seq, "\t%3d%%      %lu/%lu\n",
+				   (int)((mdev->rs_total-mdev->ov_left) /
+					 (mdev->rs_total/100+1)),
+				   mdev->rs_total - mdev->ov_left,
+				   mdev->rs_total);
+
+		if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
+			lc_seq_printf_stats(seq, mdev->resync);
+			lc_seq_printf_stats(seq, mdev->act_log);
+			put_ldev(mdev);
+		}
+
+		if (proc_details >= 2) {
+			if (mdev->resync) {
+				lc_seq_dump_details(seq, mdev->resync, "rs_left",
+					resync_dump_detail);
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int drbd_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, drbd_seq_show, PDE(inode)->data);
+}
+
+/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 000000000000..63686c4d85cf
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,4456 @@
+/*
+   drbd_receiver.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/version.h>
+#include <linux/drbd.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/pkt_sched.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_req.h"
+
+#include "drbd_vli.h"
+
+struct flush_work {
+	struct drbd_work w;
+	struct drbd_epoch *epoch;
+};
+
+enum finish_epoch {
+	FE_STILL_LIVE,
+	FE_DESTROYED,
+	FE_RECYCLED,
+};
+
+static int drbd_do_handshake(struct drbd_conf *mdev);
+static int drbd_do_auth(struct drbd_conf *mdev);
+
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
+
+static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+{
+	struct drbd_epoch *prev;
+	spin_lock(&mdev->epoch_lock);
+	prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
+	if (prev == epoch || prev == mdev->current_epoch)
+		prev = NULL;
+	spin_unlock(&mdev->epoch_lock);
+	return prev;
+}
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
+{
+	struct page *page = NULL;
+
+	/* Yes, testing drbd_pp_vacant outside the lock is racy.
+	 * So what. It saves a spin_lock. */
+	if (drbd_pp_vacant > 0) {
+		spin_lock(&drbd_pp_lock);
+		page = drbd_pp_pool;
+		if (page) {
+			drbd_pp_pool = (struct page *)page_private(page);
+			set_page_private(page, 0); /* just to be polite */
+			drbd_pp_vacant--;
+		}
+		spin_unlock(&drbd_pp_lock);
+	}
+	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	if (!page)
+		page = alloc_page(GFP_TRY);
+	if (page)
+		atomic_inc(&mdev->pp_in_use);
+	return page;
+}
+
+/* kick lower level device, if we have more than (arbitrary number)
+ * reference counts on it, which typically are locally submitted io
+ * requests.  don't use unacked_cnt, so we speed up proto A and B, too. */
+static void maybe_kick_lo(struct drbd_conf *mdev)
+{
+	if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
+		drbd_kick_lo(mdev);
+}
+
+static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
+{
+	struct drbd_epoch_entry *e;
+	struct list_head *le, *tle;
+
+	/* The EEs are always appended to the end of the list. Since
+	   they are sent in order over the wire, they have to finish
+	   in order. As soon as we see the first not finished we can
+	   stop to examine the list... */
+
+	list_for_each_safe(le, tle, &mdev->net_ee) {
+		e = list_entry(le, struct drbd_epoch_entry, w.list);
+		if (drbd_bio_has_active_page(e->private_bio))
+			break;
+		list_move(le, to_be_freed);
+	}
+}
+
+static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
+{
+	LIST_HEAD(reclaimed);
+	struct drbd_epoch_entry *e, *t;
+
+	maybe_kick_lo(mdev);
+	spin_lock_irq(&mdev->req_lock);
+	reclaim_net_ee(mdev, &reclaimed);
+	spin_unlock_irq(&mdev->req_lock);
+
+	list_for_each_entry_safe(e, t, &reclaimed, w.list)
+		drbd_free_ee(mdev, e);
+}
+
+/**
+ * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
+ * @mdev:	DRBD device.
+ * @retry:	whether or not to retry allocation forever (or until signalled)
+ *
+ * Tries to allocate a page, first from our own page pool, then from the
+ * kernel, unless this allocation would exceed the max_buffers setting.
+ * If @retry is non-zero, retry until DRBD frees a page somewhere else.
+ */
+static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
+{
+	struct page *page = NULL;
+	DEFINE_WAIT(wait);
+
+	if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
+		page = drbd_pp_first_page_or_try_alloc(mdev);
+		if (page)
+			return page;
+	}
+
+	for (;;) {
+		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
+
+		drbd_kick_lo_and_reclaim_net(mdev);
+
+		if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
+			page = drbd_pp_first_page_or_try_alloc(mdev);
+			if (page)
+				break;
+		}
+
+		if (!retry)
+			break;
+
+		if (signal_pending(current)) {
+			dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
+			break;
+		}
+
+		schedule();
+	}
+	finish_wait(&drbd_pp_wait, &wait);
+
+	return page;
+}
+
+/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
+ * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */
+static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
+{
+	int free_it;
+
+	spin_lock(&drbd_pp_lock);
+	if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
+		free_it = 1;
+	} else {
+		set_page_private(page, (unsigned long)drbd_pp_pool);
+		drbd_pp_pool = page;
+		drbd_pp_vacant++;
+		free_it = 0;
+	}
+	spin_unlock(&drbd_pp_lock);
+
+	atomic_dec(&mdev->pp_in_use);
+
+	if (free_it)
+		__free_page(page);
+
+	wake_up(&drbd_pp_wait);
+}
+
+static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
+{
+	struct page *p_to_be_freed = NULL;
+	struct page *page;
+	struct bio_vec *bvec;
+	int i;
+
+	spin_lock(&drbd_pp_lock);
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
+			set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
+			p_to_be_freed = bvec->bv_page;
+		} else {
+			set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
+			drbd_pp_pool = bvec->bv_page;
+			drbd_pp_vacant++;
+		}
+	}
+	spin_unlock(&drbd_pp_lock);
+	atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
+
+	while (p_to_be_freed) {
+		page = p_to_be_freed;
+		p_to_be_freed = (struct page *)page_private(page);
+		set_page_private(page, 0); /* just to be polite */
+		put_page(page);
+	}
+
+	wake_up(&drbd_pp_wait);
+}
+
+/*
+You need to hold the req_lock:
+ _drbd_wait_ee_list_empty()
+
+You must not have the req_lock:
+ drbd_free_ee()
+ drbd_alloc_ee()
+ drbd_init_ee()
+ drbd_release_ee()
+ drbd_ee_fix_bhs()
+ drbd_process_done_ee()
+ drbd_clear_done_ee()
+ drbd_wait_ee_list_empty()
+*/
+
+struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
+				     u64 id,
+				     sector_t sector,
+				     unsigned int data_size,
+				     gfp_t gfp_mask) __must_hold(local)
+{
+	struct request_queue *q;
+	struct drbd_epoch_entry *e;
+	struct page *page;
+	struct bio *bio;
+	unsigned int ds;
+
+	if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
+		return NULL;
+
+	e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+	if (!e) {
+		if (!(gfp_mask & __GFP_NOWARN))
+			dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
+		return NULL;
+	}
+
+	bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
+	if (!bio) {
+		if (!(gfp_mask & __GFP_NOWARN))
+			dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
+		goto fail1;
+	}
+
+	bio->bi_bdev = mdev->ldev->backing_bdev;
+	bio->bi_sector = sector;
+
+	ds = data_size;
+	while (ds) {
+		page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
+		if (!page) {
+			if (!(gfp_mask & __GFP_NOWARN))
+				dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
+			goto fail2;
+		}
+		if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
+			drbd_pp_free(mdev, page);
+			dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
+			    "data_size=%u,ds=%u) failed\n",
+			    (unsigned long long)sector, data_size, ds);
+
+			q = bdev_get_queue(bio->bi_bdev);
+			if (q->merge_bvec_fn) {
+				struct bvec_merge_data bvm = {
+					.bi_bdev = bio->bi_bdev,
+					.bi_sector = bio->bi_sector,
+					.bi_size = bio->bi_size,
+					.bi_rw = bio->bi_rw,
+				};
+				int l = q->merge_bvec_fn(q, &bvm,
+						&bio->bi_io_vec[bio->bi_vcnt]);
+				dev_err(DEV, "merge_bvec_fn() = %d\n", l);
+			}
+
+			/* dump more of the bio. */
+			dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
+			dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
+			dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
+			dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
+
+			goto fail2;
+			break;
+		}
+		ds -= min_t(int, ds, PAGE_SIZE);
+	}
+
+	D_ASSERT(data_size == bio->bi_size);
+
+	bio->bi_private = e;
+	e->mdev = mdev;
+	e->sector = sector;
+	e->size = bio->bi_size;
+
+	e->private_bio = bio;
+	e->block_id = id;
+	INIT_HLIST_NODE(&e->colision);
+	e->epoch = NULL;
+	e->flags = 0;
+
+	trace_drbd_ee(mdev, e, "allocated");
+
+	return e;
+
+ fail2:
+	drbd_pp_free_bio_pages(mdev, bio);
+	bio_put(bio);
+ fail1:
+	mempool_free(e, drbd_ee_mempool);
+
+	return NULL;
+}
+
+void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+	struct bio *bio = e->private_bio;
+	trace_drbd_ee(mdev, e, "freed");
+	drbd_pp_free_bio_pages(mdev, bio);
+	bio_put(bio);
+	D_ASSERT(hlist_unhashed(&e->colision));
+	mempool_free(e, drbd_ee_mempool);
+}
+
+int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
+{
+	LIST_HEAD(work_list);
+	struct drbd_epoch_entry *e, *t;
+	int count = 0;
+
+	spin_lock_irq(&mdev->req_lock);
+	list_splice_init(list, &work_list);
+	spin_unlock_irq(&mdev->req_lock);
+
+	list_for_each_entry_safe(e, t, &work_list, w.list) {
+		drbd_free_ee(mdev, e);
+		count++;
+	}
+	return count;
+}
+
+
+/*
+ * This function is called from _asender only_
+ * but see also comments in _req_mod(,barrier_acked)
+ * and receive_Barrier.
+ *
+ * Move entries from net_ee to done_ee, if ready.
+ * Grab done_ee, call all callbacks, free the entries.
+ * The callbacks typically send out ACKs.
+ */
+static int drbd_process_done_ee(struct drbd_conf *mdev)
+{
+	LIST_HEAD(work_list);
+	LIST_HEAD(reclaimed);
+	struct drbd_epoch_entry *e, *t;
+	int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
+
+	spin_lock_irq(&mdev->req_lock);
+	reclaim_net_ee(mdev, &reclaimed);
+	list_splice_init(&mdev->done_ee, &work_list);
+	spin_unlock_irq(&mdev->req_lock);
+
+	list_for_each_entry_safe(e, t, &reclaimed, w.list)
+		drbd_free_ee(mdev, e);
+
+	/* possible callbacks here:
+	 * e_end_block, and e_end_resync_block, e_send_discard_ack.
+	 * all ignore the last argument.
+	 */
+	list_for_each_entry_safe(e, t, &work_list, w.list) {
+		trace_drbd_ee(mdev, e, "process_done_ee");
+		/* list_del not necessary, next/prev members not touched */
+		ok = e->w.cb(mdev, &e->w, !ok) && ok;
+		drbd_free_ee(mdev, e);
+	}
+	wake_up(&mdev->ee_wait);
+
+	return ok;
+}
+
+void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+{
+	DEFINE_WAIT(wait);
+
+	/* avoids spin_lock/unlock
+	 * and calling prepare_to_wait in the fast path */
+	while (!list_empty(head)) {
+		prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&mdev->req_lock);
+		drbd_kick_lo(mdev);
+		schedule();
+		finish_wait(&mdev->ee_wait, &wait);
+		spin_lock_irq(&mdev->req_lock);
+	}
+}
+
+void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+{
+	spin_lock_irq(&mdev->req_lock);
+	_drbd_wait_ee_list_empty(mdev, head);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/* see also kernel_accept; which is only present since 2.6.18.
+ * also we want to log which part of it failed, exactly */
+static int drbd_accept(struct drbd_conf *mdev, const char **what,
+		struct socket *sock, struct socket **newsock)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	*what = "listen";
+	err = sock->ops->listen(sock, 5);
+	if (err < 0)
+		goto out;
+
+	*what = "sock_create_lite";
+	err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
+			       newsock);
+	if (err < 0)
+		goto out;
+
+	*what = "accept";
+	err = sock->ops->accept(sock, *newsock, 0);
+	if (err < 0) {
+		sock_release(*newsock);
+		*newsock = NULL;
+		goto out;
+	}
+	(*newsock)->ops  = sock->ops;
+
+out:
+	return err;
+}
+
+static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
+		    void *buf, size_t size, int flags)
+{
+	mm_segment_t oldfs;
+	struct kvec iov = {
+		.iov_base = buf,
+		.iov_len = size,
+	};
+	struct msghdr msg = {
+		.msg_iovlen = 1,
+		.msg_iov = (struct iovec *)&iov,
+		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
+	};
+	int rv;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
+	set_fs(oldfs);
+
+	return rv;
+}
+
+static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
+{
+	mm_segment_t oldfs;
+	struct kvec iov = {
+		.iov_base = buf,
+		.iov_len = size,
+	};
+	struct msghdr msg = {
+		.msg_iovlen = 1,
+		.msg_iov = (struct iovec *)&iov,
+		.msg_flags = MSG_WAITALL | MSG_NOSIGNAL
+	};
+	int rv;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	for (;;) {
+		rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
+		if (rv == size)
+			break;
+
+		/* Note:
+		 * ECONNRESET	other side closed the connection
+		 * ERESTARTSYS	(on  sock) we got a signal
+		 */
+
+		if (rv < 0) {
+			if (rv == -ECONNRESET)
+				dev_info(DEV, "sock was reset by peer\n");
+			else if (rv != -ERESTARTSYS)
+				dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+			break;
+		} else if (rv == 0) {
+			dev_info(DEV, "sock was shut down by peer\n");
+			break;
+		} else	{
+			/* signal came in, or peer/link went down,
+			 * after we read a partial message
+			 */
+			/* D_ASSERT(signal_pending(current)); */
+			break;
+		}
+	};
+
+	set_fs(oldfs);
+
+	if (rv != size)
+		drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+
+	return rv;
+}
+
+static struct socket *drbd_try_connect(struct drbd_conf *mdev)
+{
+	const char *what;
+	struct socket *sock;
+	struct sockaddr_in6 src_in6;
+	int err;
+	int disconnect_on_error = 1;
+
+	if (!get_net_conf(mdev))
+		return NULL;
+
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+		SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		sock = NULL;
+		goto out;
+	}
+
+	sock->sk->sk_rcvtimeo =
+	sock->sk->sk_sndtimeo =  mdev->net_conf->try_connect_int*HZ;
+
+       /* explicitly bind to the configured IP as source IP
+	*  for the outgoing connections.
+	*  This is needed for multihomed hosts and to be
+	*  able to use lo: interfaces for drbd.
+	* Make sure to use 0 as port number, so linux selects
+	*  a free one dynamically.
+	*/
+	memcpy(&src_in6, mdev->net_conf->my_addr,
+	       min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
+	if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
+		src_in6.sin6_port = 0;
+	else
+		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+	what = "bind before connect";
+	err = sock->ops->bind(sock,
+			      (struct sockaddr *) &src_in6,
+			      mdev->net_conf->my_addr_len);
+	if (err < 0)
+		goto out;
+
+	/* connect may fail, peer not yet available.
+	 * stay C_WF_CONNECTION, don't go Disconnecting! */
+	disconnect_on_error = 0;
+	what = "connect";
+	err = sock->ops->connect(sock,
+				 (struct sockaddr *)mdev->net_conf->peer_addr,
+				 mdev->net_conf->peer_addr_len, 0);
+
+out:
+	if (err < 0) {
+		if (sock) {
+			sock_release(sock);
+			sock = NULL;
+		}
+		switch (-err) {
+			/* timeout, busy, signal pending */
+		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
+		case EINTR: case ERESTARTSYS:
+			/* peer not (yet) available, network problem */
+		case ECONNREFUSED: case ENETUNREACH:
+		case EHOSTDOWN:    case EHOSTUNREACH:
+			disconnect_on_error = 0;
+			break;
+		default:
+			dev_err(DEV, "%s failed, err = %d\n", what, err);
+		}
+		if (disconnect_on_error)
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	}
+	put_net_conf(mdev);
+	return sock;
+}
+
+static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
+{
+	int timeo, err;
+	struct socket *s_estab = NULL, *s_listen;
+	const char *what;
+
+	if (!get_net_conf(mdev))
+		return NULL;
+
+	what = "sock_create_kern";
+	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
+		SOCK_STREAM, IPPROTO_TCP, &s_listen);
+	if (err) {
+		s_listen = NULL;
+		goto out;
+	}
+
+	timeo = mdev->net_conf->try_connect_int * HZ;
+	timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
+
+	s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
+	s_listen->sk->sk_rcvtimeo = timeo;
+	s_listen->sk->sk_sndtimeo = timeo;
+
+	what = "bind before listen";
+	err = s_listen->ops->bind(s_listen,
+			      (struct sockaddr *) mdev->net_conf->my_addr,
+			      mdev->net_conf->my_addr_len);
+	if (err < 0)
+		goto out;
+
+	err = drbd_accept(mdev, &what, s_listen, &s_estab);
+
+out:
+	if (s_listen)
+		sock_release(s_listen);
+	if (err < 0) {
+		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+			dev_err(DEV, "%s failed, err = %d\n", what, err);
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		}
+	}
+	put_net_conf(mdev);
+
+	return s_estab;
+}
+
+static int drbd_send_fp(struct drbd_conf *mdev,
+	struct socket *sock, enum drbd_packets cmd)
+{
+	struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+
+	return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
+}
+
+static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
+{
+	struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+	int rr;
+
+	rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
+
+	if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
+		return be16_to_cpu(h->command);
+
+	return 0xffff;
+}
+
+/**
+ * drbd_socket_okay() - Free the socket if its connection is not okay
+ * @mdev:	DRBD device.
+ * @sock:	pointer to the pointer to the socket.
+ */
+static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
+{
+	int rr;
+	char tb[4];
+
+	if (!*sock)
+		return FALSE;
+
+	rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+
+	if (rr > 0 || rr == -EAGAIN) {
+		return TRUE;
+	} else {
+		sock_release(*sock);
+		*sock = NULL;
+		return FALSE;
+	}
+}
+
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ *  -2 We do not have a network config...
+ */
+static int drbd_connect(struct drbd_conf *mdev)
+{
+	struct socket *s, *sock, *msock;
+	int try, h, ok;
+
+	D_ASSERT(!mdev->data.socket);
+
+	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
+		dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
+
+	if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
+		return -2;
+
+	clear_bit(DISCARD_CONCURRENT, &mdev->flags);
+
+	sock  = NULL;
+	msock = NULL;
+
+	do {
+		for (try = 0;;) {
+			/* 3 tries, this should take less than a second! */
+			s = drbd_try_connect(mdev);
+			if (s || ++try >= 3)
+				break;
+			/* give the other side time to call bind() & listen() */
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ / 10);
+		}
+
+		if (s) {
+			if (!sock) {
+				drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
+				sock = s;
+				s = NULL;
+			} else if (!msock) {
+				drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
+				msock = s;
+				s = NULL;
+			} else {
+				dev_err(DEV, "Logic error in drbd_connect()\n");
+				goto out_release_sockets;
+			}
+		}
+
+		if (sock && msock) {
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ / 10);
+			ok = drbd_socket_okay(mdev, &sock);
+			ok = drbd_socket_okay(mdev, &msock) && ok;
+			if (ok)
+				break;
+		}
+
+retry:
+		s = drbd_wait_for_connect(mdev);
+		if (s) {
+			try = drbd_recv_fp(mdev, s);
+			drbd_socket_okay(mdev, &sock);
+			drbd_socket_okay(mdev, &msock);
+			switch (try) {
+			case P_HAND_SHAKE_S:
+				if (sock) {
+					dev_warn(DEV, "initial packet S crossed\n");
+					sock_release(sock);
+				}
+				sock = s;
+				break;
+			case P_HAND_SHAKE_M:
+				if (msock) {
+					dev_warn(DEV, "initial packet M crossed\n");
+					sock_release(msock);
+				}
+				msock = s;
+				set_bit(DISCARD_CONCURRENT, &mdev->flags);
+				break;
+			default:
+				dev_warn(DEV, "Error receiving initial packet\n");
+				sock_release(s);
+				if (random32() & 1)
+					goto retry;
+			}
+		}
+
+		if (mdev->state.conn <= C_DISCONNECTING)
+			goto out_release_sockets;
+		if (signal_pending(current)) {
+			flush_signals(current);
+			smp_rmb();
+			if (get_t_state(&mdev->receiver) == Exiting)
+				goto out_release_sockets;
+		}
+
+		if (sock && msock) {
+			ok = drbd_socket_okay(mdev, &sock);
+			ok = drbd_socket_okay(mdev, &msock) && ok;
+			if (ok)
+				break;
+		}
+	} while (1);
+
+	msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+	sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+
+	sock->sk->sk_allocation = GFP_NOIO;
+	msock->sk->sk_allocation = GFP_NOIO;
+
+	sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+	msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+
+	if (mdev->net_conf->sndbuf_size) {
+		sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
+		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+
+	if (mdev->net_conf->rcvbuf_size) {
+		sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
+		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+
+	/* NOT YET ...
+	 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+	 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+	 * first set it to the P_HAND_SHAKE timeout,
+	 * which we set to 4x the configured ping_timeout. */
+	sock->sk->sk_sndtimeo =
+	sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+
+	msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+	msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+
+	/* we don't want delays.
+	 * we use TCP_CORK where apropriate, though */
+	drbd_tcp_nodelay(sock);
+	drbd_tcp_nodelay(msock);
+
+	mdev->data.socket = sock;
+	mdev->meta.socket = msock;
+	mdev->last_received = jiffies;
+
+	D_ASSERT(mdev->asender.task == NULL);
+
+	h = drbd_do_handshake(mdev);
+	if (h <= 0)
+		return h;
+
+	if (mdev->cram_hmac_tfm) {
+		/* drbd_request_state(mdev, NS(conn, WFAuth)); */
+		if (!drbd_do_auth(mdev)) {
+			dev_err(DEV, "Authentication of peer failed\n");
+			return -1;
+		}
+	}
+
+	if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
+		return 0;
+
+	sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
+	sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+
+	atomic_set(&mdev->packet_seq, 0);
+	mdev->peer_seq = 0;
+
+	drbd_thread_start(&mdev->asender);
+
+	drbd_send_protocol(mdev);
+	drbd_send_sync_param(mdev, &mdev->sync_conf);
+	drbd_send_sizes(mdev, 0);
+	drbd_send_uuids(mdev);
+	drbd_send_state(mdev);
+	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+	clear_bit(RESIZE_PENDING, &mdev->flags);
+
+	return 1;
+
+out_release_sockets:
+	if (sock)
+		sock_release(sock);
+	if (msock)
+		sock_release(msock);
+	return -1;
+}
+
+static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
+{
+	int r;
+
+	r = drbd_recv(mdev, h, sizeof(*h));
+
+	if (unlikely(r != sizeof(*h))) {
+		dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
+		return FALSE;
+	};
+	h->command = be16_to_cpu(h->command);
+	h->length  = be16_to_cpu(h->length);
+	if (unlikely(h->magic != BE_DRBD_MAGIC)) {
+		dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
+		    (long)be32_to_cpu(h->magic),
+		    h->command, h->length);
+		return FALSE;
+	}
+	mdev->last_received = jiffies;
+
+	return TRUE;
+}
+
+static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
+{
+	int rv;
+
+	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
+		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
+		if (rv) {
+			dev_err(DEV, "local disk flush failed with status %d\n", rv);
+			/* would rather check on EOPNOTSUPP, but that is not reliable.
+			 * don't try again for ANY return value != 0
+			 * if (rv == -EOPNOTSUPP) */
+			drbd_bump_write_ordering(mdev, WO_drain_io);
+		}
+		put_ldev(mdev);
+	}
+
+	return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
+}
+
+static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct flush_work *fw = (struct flush_work *)w;
+	struct drbd_epoch *epoch = fw->epoch;
+
+	kfree(w);
+
+	if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
+		drbd_flush_after_epoch(mdev, epoch);
+
+	drbd_may_finish_epoch(mdev, epoch, EV_PUT |
+			      (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
+
+	return 1;
+}
+
+/**
+ * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
+ * @mdev:	DRBD device.
+ * @epoch:	Epoch object.
+ * @ev:		Epoch event.
+ */
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
+					       struct drbd_epoch *epoch,
+					       enum epoch_event ev)
+{
+	int finish, epoch_size;
+	struct drbd_epoch *next_epoch;
+	int schedule_flush = 0;
+	enum finish_epoch rv = FE_STILL_LIVE;
+
+	spin_lock(&mdev->epoch_lock);
+	do {
+		next_epoch = NULL;
+		finish = 0;
+
+		epoch_size = atomic_read(&epoch->epoch_size);
+
+		switch (ev & ~EV_CLEANUP) {
+		case EV_PUT:
+			atomic_dec(&epoch->active);
+			break;
+		case EV_GOT_BARRIER_NR:
+			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+
+			/* Special case: If we just switched from WO_bio_barrier to
+			   WO_bdev_flush we should not finish the current epoch */
+			if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
+			    mdev->write_ordering != WO_bio_barrier &&
+			    epoch == mdev->current_epoch)
+				clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
+			break;
+		case EV_BARRIER_DONE:
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
+			break;
+		case EV_BECAME_LAST:
+			/* nothing to do*/
+			break;
+		}
+
+		trace_drbd_epoch(mdev, epoch, ev);
+
+		if (epoch_size != 0 &&
+		    atomic_read(&epoch->active) == 0 &&
+		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
+		    epoch->list.prev == &mdev->current_epoch->list &&
+		    !test_bit(DE_IS_FINISHING, &epoch->flags)) {
+			/* Nearly all conditions are met to finish that epoch... */
+			if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
+			    mdev->write_ordering == WO_none ||
+			    (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
+			    ev & EV_CLEANUP) {
+				finish = 1;
+				set_bit(DE_IS_FINISHING, &epoch->flags);
+			} else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
+				 mdev->write_ordering == WO_bio_barrier) {
+				atomic_inc(&epoch->active);
+				schedule_flush = 1;
+			}
+		}
+		if (finish) {
+			if (!(ev & EV_CLEANUP)) {
+				spin_unlock(&mdev->epoch_lock);
+				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
+				spin_lock(&mdev->epoch_lock);
+			}
+			dec_unacked(mdev);
+
+			if (mdev->current_epoch != epoch) {
+				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
+				list_del(&epoch->list);
+				ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
+				mdev->epochs--;
+				trace_drbd_epoch(mdev, epoch, EV_TRACE_FREE);
+				kfree(epoch);
+
+				if (rv == FE_STILL_LIVE)
+					rv = FE_DESTROYED;
+			} else {
+				epoch->flags = 0;
+				atomic_set(&epoch->epoch_size, 0);
+				/* atomic_set(&epoch->active, 0); is alrady zero */
+				if (rv == FE_STILL_LIVE)
+					rv = FE_RECYCLED;
+			}
+		}
+
+		if (!next_epoch)
+			break;
+
+		epoch = next_epoch;
+	} while (1);
+
+	spin_unlock(&mdev->epoch_lock);
+
+	if (schedule_flush) {
+		struct flush_work *fw;
+		fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
+		if (fw) {
+			trace_drbd_epoch(mdev, epoch, EV_TRACE_FLUSH);
+			fw->w.cb = w_flush;
+			fw->epoch = epoch;
+			drbd_queue_work(&mdev->data.work, &fw->w);
+		} else {
+			dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+			/* That is not a recursion, only one level */
+			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
+			drbd_may_finish_epoch(mdev, epoch, EV_PUT);
+		}
+	}
+
+	return rv;
+}
+
+/**
+ * drbd_bump_write_ordering() - Fall back to an other write ordering method
+ * @mdev:	DRBD device.
+ * @wo:		Write ordering method to try.
+ */
+void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
+{
+	enum write_ordering_e pwo;
+	static char *write_ordering_str[] = {
+		[WO_none] = "none",
+		[WO_drain_io] = "drain",
+		[WO_bdev_flush] = "flush",
+		[WO_bio_barrier] = "barrier",
+	};
+
+	pwo = mdev->write_ordering;
+	wo = min(pwo, wo);
+	if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
+		wo = WO_bdev_flush;
+	if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
+		wo = WO_drain_io;
+	if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
+		wo = WO_none;
+	mdev->write_ordering = wo;
+	if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
+		dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
+}
+
+/**
+ * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways (unused in this callback)
+ */
+int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	struct bio *bio = e->private_bio;
+
+	/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
+	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
+	   so that we can finish that epoch in drbd_may_finish_epoch().
+	   That is necessary if we already have a long chain of Epochs, before
+	   we realize that BIO_RW_BARRIER is actually not supported */
+
+	/* As long as the -ENOTSUPP on the barrier is reported immediately
+	   that will never trigger. If it is reported late, we will just
+	   print that warning and continue correctly for all future requests
+	   with WO_bdev_flush */
+	if (previous_epoch(mdev, e->epoch))
+		dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
+
+	/* prepare bio for re-submit,
+	 * re-init volatile members */
+	/* we still have a local reference,
+	 * get_ldev was done in receive_Data. */
+	bio->bi_bdev = mdev->ldev->backing_bdev;
+	bio->bi_sector = e->sector;
+	bio->bi_size = e->size;
+	bio->bi_idx = 0;
+
+	bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+	bio->bi_flags |= 1 << BIO_UPTODATE;
+
+	/* don't know whether this is necessary: */
+	bio->bi_phys_segments = 0;
+	bio->bi_next = NULL;
+
+	/* these should be unchanged: */
+	/* bio->bi_end_io = drbd_endio_write_sec; */
+	/* bio->bi_vcnt = whatever; */
+
+	e->w.cb = e_end_block;
+
+	/* This is no longer a barrier request. */
+	bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
+
+	drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
+
+	return 1;
+}
+
+static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
+{
+	int rv, issue_flush;
+	struct p_barrier *p = (struct p_barrier *)h;
+	struct drbd_epoch *epoch;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+
+	rv = drbd_recv(mdev, h->payload, h->length);
+	ERR_IF(rv != h->length) return FALSE;
+
+	inc_unacked(mdev);
+
+	if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
+		drbd_kick_lo(mdev);
+
+	mdev->current_epoch->barrier_nr = p->barrier;
+	rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
+
+	/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
+	 * the activity log, which means it would not be resynced in case the
+	 * R_PRIMARY crashes now.
+	 * Therefore we must send the barrier_ack after the barrier request was
+	 * completed. */
+	switch (mdev->write_ordering) {
+	case WO_bio_barrier:
+	case WO_none:
+		if (rv == FE_RECYCLED)
+			return TRUE;
+		break;
+
+	case WO_bdev_flush:
+	case WO_drain_io:
+		D_ASSERT(rv == FE_STILL_LIVE);
+		set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
+		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+		rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+		if (rv == FE_RECYCLED)
+			return TRUE;
+
+		/* The asender will send all the ACKs and barrier ACKs out, since
+		   all EEs moved from the active_ee to the done_ee. We need to
+		   provide a new epoch object for the EEs that come in soon */
+		break;
+	}
+
+	/* receiver context, in the writeout path of the other node.
+	 * avoid potential distributed deadlock */
+	epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+	if (!epoch) {
+		dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+		issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+		if (issue_flush) {
+			rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
+			if (rv == FE_RECYCLED)
+				return TRUE;
+		}
+
+		drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
+
+		return TRUE;
+	}
+
+	epoch->flags = 0;
+	atomic_set(&epoch->epoch_size, 0);
+	atomic_set(&epoch->active, 0);
+
+	spin_lock(&mdev->epoch_lock);
+	if (atomic_read(&mdev->current_epoch->epoch_size)) {
+		list_add(&epoch->list, &mdev->current_epoch->list);
+		mdev->current_epoch = epoch;
+		mdev->epochs++;
+		trace_drbd_epoch(mdev, epoch, EV_TRACE_ALLOC);
+	} else {
+		/* The current_epoch got recycled while we allocated this one... */
+		kfree(epoch);
+	}
+	spin_unlock(&mdev->epoch_lock);
+
+	return TRUE;
+}
+
+/* used from receive_RSDataReply (recv_resync_read)
+ * and from receive_Data */
+static struct drbd_epoch_entry *
+read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
+{
+	struct drbd_epoch_entry *e;
+	struct bio_vec *bvec;
+	struct page *page;
+	struct bio *bio;
+	int dgs, ds, i, rr;
+	void *dig_in = mdev->int_dig_in;
+	void *dig_vv = mdev->int_dig_vv;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+
+	if (dgs) {
+		rr = drbd_recv(mdev, dig_in, dgs);
+		if (rr != dgs) {
+			dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
+			     rr, dgs);
+			return NULL;
+		}
+	}
+
+	data_size -= dgs;
+
+	ERR_IF(data_size &  0x1ff) return NULL;
+	ERR_IF(data_size >  DRBD_MAX_SEGMENT_SIZE) return NULL;
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
+	if (!e)
+		return NULL;
+	bio = e->private_bio;
+	ds = data_size;
+	bio_for_each_segment(bvec, bio, i) {
+		page = bvec->bv_page;
+		rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
+		kunmap(page);
+		if (rr != min_t(int, ds, PAGE_SIZE)) {
+			drbd_free_ee(mdev, e);
+			dev_warn(DEV, "short read receiving data: read %d expected %d\n",
+			     rr, min_t(int, ds, PAGE_SIZE));
+			return NULL;
+		}
+		ds -= rr;
+	}
+
+	if (dgs) {
+		drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+		if (memcmp(dig_in, dig_vv, dgs)) {
+			dev_err(DEV, "Digest integrity check FAILED.\n");
+			drbd_bcast_ee(mdev, "digest failed",
+					dgs, dig_in, dig_vv, e);
+			drbd_free_ee(mdev, e);
+			return NULL;
+		}
+	}
+	mdev->recv_cnt += data_size>>9;
+	return e;
+}
+
+/* drbd_drain_block() just takes a data block
+ * out of the socket input buffer, and discards it.
+ */
+static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
+{
+	struct page *page;
+	int rr, rv = 1;
+	void *data;
+
+	page = drbd_pp_alloc(mdev, 1);
+
+	data = kmap(page);
+	while (data_size) {
+		rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
+		if (rr != min_t(int, data_size, PAGE_SIZE)) {
+			rv = 0;
+			dev_warn(DEV, "short read receiving data: read %d expected %d\n",
+			     rr, min_t(int, data_size, PAGE_SIZE));
+			break;
+		}
+		data_size -= rr;
+	}
+	kunmap(page);
+	drbd_pp_free(mdev, page);
+	return rv;
+}
+
+static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
+			   sector_t sector, int data_size)
+{
+	struct bio_vec *bvec;
+	struct bio *bio;
+	int dgs, rr, i, expect;
+	void *dig_in = mdev->int_dig_in;
+	void *dig_vv = mdev->int_dig_vv;
+
+	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
+
+	if (dgs) {
+		rr = drbd_recv(mdev, dig_in, dgs);
+		if (rr != dgs) {
+			dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
+			     rr, dgs);
+			return 0;
+		}
+	}
+
+	data_size -= dgs;
+
+	/* optimistically update recv_cnt.  if receiving fails below,
+	 * we disconnect anyways, and counters will be reset. */
+	mdev->recv_cnt += data_size>>9;
+
+	bio = req->master_bio;
+	D_ASSERT(sector == bio->bi_sector);
+
+	bio_for_each_segment(bvec, bio, i) {
+		expect = min_t(int, data_size, bvec->bv_len);
+		rr = drbd_recv(mdev,
+			     kmap(bvec->bv_page)+bvec->bv_offset,
+			     expect);
+		kunmap(bvec->bv_page);
+		if (rr != expect) {
+			dev_warn(DEV, "short read receiving data reply: "
+			     "read %d expected %d\n",
+			     rr, expect);
+			return 0;
+		}
+		data_size -= rr;
+	}
+
+	if (dgs) {
+		drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+		if (memcmp(dig_in, dig_vv, dgs)) {
+			dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
+			return 0;
+		}
+	}
+
+	D_ASSERT(data_size == 0);
+	return 1;
+}
+
+/* e_end_resync_block() is called via
+ * drbd_process_done_ee() by asender only */
+static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	sector_t sector = e->sector;
+	int ok;
+
+	D_ASSERT(hlist_unhashed(&e->colision));
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		drbd_set_in_sync(mdev, sector, e->size);
+		ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
+	} else {
+		/* Record failure to sync */
+		drbd_rs_failed_io(mdev, sector, e->size);
+
+		ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
+	}
+	dec_unacked(mdev);
+
+	return ok;
+}
+
+static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
+{
+	struct drbd_epoch_entry *e;
+
+	e = read_in_block(mdev, ID_SYNCER, sector, data_size);
+	if (!e) {
+		put_ldev(mdev);
+		return FALSE;
+	}
+
+	dec_rs_pending(mdev);
+
+	e->private_bio->bi_end_io = drbd_endio_write_sec;
+	e->private_bio->bi_rw = WRITE;
+	e->w.cb = e_end_resync_block;
+
+	inc_unacked(mdev);
+	/* corresponding dec_unacked() in e_end_resync_block()
+	 * respective _drbd_clear_done_ee */
+
+	spin_lock_irq(&mdev->req_lock);
+	list_add(&e->w.list, &mdev->sync_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	trace_drbd_ee(mdev, e, "submitting for (rs)write");
+	trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL);
+	drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
+	/* accounting done in endio */
+
+	maybe_kick_lo(mdev);
+	return TRUE;
+}
+
+static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct drbd_request *req;
+	sector_t sector;
+	unsigned int header_size, data_size;
+	int ok;
+	struct p_data *p = (struct p_data *)h;
+
+	header_size = sizeof(*p) - sizeof(*h);
+	data_size   = h->length  - header_size;
+
+	ERR_IF(data_size == 0) return FALSE;
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	sector = be64_to_cpu(p->sector);
+
+	spin_lock_irq(&mdev->req_lock);
+	req = _ar_id_to_req(mdev, p->block_id, sector);
+	spin_unlock_irq(&mdev->req_lock);
+	if (unlikely(!req)) {
+		dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
+		return FALSE;
+	}
+
+	/* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
+	 * special casing it there for the various failure cases.
+	 * still no race with drbd_fail_pending_reads */
+	ok = recv_dless_read(mdev, req, sector, data_size);
+
+	if (ok)
+		req_mod(req, data_received);
+	/* else: nothing. handled from drbd_disconnect...
+	 * I don't think we may complete this just yet
+	 * in case we are "on-disconnect: freeze" */
+
+	return ok;
+}
+
+static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	sector_t sector;
+	unsigned int header_size, data_size;
+	int ok;
+	struct p_data *p = (struct p_data *)h;
+
+	header_size = sizeof(*p) - sizeof(*h);
+	data_size   = h->length  - header_size;
+
+	ERR_IF(data_size == 0) return FALSE;
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	sector = be64_to_cpu(p->sector);
+	D_ASSERT(p->block_id == ID_SYNCER);
+
+	if (get_ldev(mdev)) {
+		/* data is submitted to disk within recv_resync_read.
+		 * corresponding put_ldev done below on error,
+		 * or in drbd_endio_write_sec. */
+		ok = recv_resync_read(mdev, sector, data_size);
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Can not write resync data to local disk.\n");
+
+		ok = drbd_drain_block(mdev, data_size);
+
+		drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+	}
+
+	return ok;
+}
+
+/* e_end_block() is called via drbd_process_done_ee().
+ * this means this function only runs in the asender thread
+ */
+static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	sector_t sector = e->sector;
+	struct drbd_epoch *epoch;
+	int ok = 1, pcmd;
+
+	if (e->flags & EE_IS_BARRIER) {
+		epoch = previous_epoch(mdev, e->epoch);
+		if (epoch)
+			drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
+	}
+
+	if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
+		if (likely(drbd_bio_uptodate(e->private_bio))) {
+			pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
+				mdev->state.conn <= C_PAUSED_SYNC_T &&
+				e->flags & EE_MAY_SET_IN_SYNC) ?
+				P_RS_WRITE_ACK : P_WRITE_ACK;
+			ok &= drbd_send_ack(mdev, pcmd, e);
+			if (pcmd == P_RS_WRITE_ACK)
+				drbd_set_in_sync(mdev, sector, e->size);
+		} else {
+			ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
+			/* we expect it to be marked out of sync anyways...
+			 * maybe assert this?  */
+		}
+		dec_unacked(mdev);
+	}
+	/* we delete from the conflict detection hash _after_ we sent out the
+	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
+	if (mdev->net_conf->two_primaries) {
+		spin_lock_irq(&mdev->req_lock);
+		D_ASSERT(!hlist_unhashed(&e->colision));
+		hlist_del_init(&e->colision);
+		spin_unlock_irq(&mdev->req_lock);
+	} else {
+		D_ASSERT(hlist_unhashed(&e->colision));
+	}
+
+	drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+
+	return ok;
+}
+
+static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
+	int ok = 1;
+
+	D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+	ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
+
+	spin_lock_irq(&mdev->req_lock);
+	D_ASSERT(!hlist_unhashed(&e->colision));
+	hlist_del_init(&e->colision);
+	spin_unlock_irq(&mdev->req_lock);
+
+	dec_unacked(mdev);
+
+	return ok;
+}
+
+/* Called from receive_Data.
+ * Synchronize packets on sock with packets on msock.
+ *
+ * This is here so even when a P_DATA packet traveling via sock overtook an Ack
+ * packet traveling on msock, they are still processed in the order they have
+ * been sent.
+ *
+ * Note: we don't care for Ack packets overtaking P_DATA packets.
+ *
+ * In case packet_seq is larger than mdev->peer_seq number, there are
+ * outstanding packets on the msock. We wait for them to arrive.
+ * In case we are the logically next packet, we update mdev->peer_seq
+ * ourselves. Correctly handles 32bit wrap around.
+ *
+ * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
+ * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
+ * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
+ * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
+ *
+ * returns 0 if we may process the packet,
+ * -ERESTARTSYS if we were interrupted (by disconnect signal). */
+static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
+{
+	DEFINE_WAIT(wait);
+	unsigned int p_seq;
+	long timeout;
+	int ret = 0;
+	spin_lock(&mdev->peer_seq_lock);
+	for (;;) {
+		prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
+		if (seq_le(packet_seq, mdev->peer_seq+1))
+			break;
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		p_seq = mdev->peer_seq;
+		spin_unlock(&mdev->peer_seq_lock);
+		timeout = schedule_timeout(30*HZ);
+		spin_lock(&mdev->peer_seq_lock);
+		if (timeout == 0 && p_seq == mdev->peer_seq) {
+			ret = -ETIMEDOUT;
+			dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
+			break;
+		}
+	}
+	finish_wait(&mdev->seq_wait, &wait);
+	if (mdev->peer_seq+1 == packet_seq)
+		mdev->peer_seq++;
+	spin_unlock(&mdev->peer_seq_lock);
+	return ret;
+}
+
+/* mirrored write */
+static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
+{
+	sector_t sector;
+	struct drbd_epoch_entry *e;
+	struct p_data *p = (struct p_data *)h;
+	int header_size, data_size;
+	int rw = WRITE;
+	u32 dp_flags;
+
+	header_size = sizeof(*p) - sizeof(*h);
+	data_size   = h->length  - header_size;
+
+	ERR_IF(data_size == 0) return FALSE;
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	if (!get_ldev(mdev)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Can not write mirrored data block "
+			    "to local disk.\n");
+		spin_lock(&mdev->peer_seq_lock);
+		if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
+			mdev->peer_seq++;
+		spin_unlock(&mdev->peer_seq_lock);
+
+		drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+		atomic_inc(&mdev->current_epoch->epoch_size);
+		return drbd_drain_block(mdev, data_size);
+	}
+
+	/* get_ldev(mdev) successful.
+	 * Corresponding put_ldev done either below (on various errors),
+	 * or in drbd_endio_write_sec, if we successfully submit the data at
+	 * the end of this function. */
+
+	sector = be64_to_cpu(p->sector);
+	e = read_in_block(mdev, p->block_id, sector, data_size);
+	if (!e) {
+		put_ldev(mdev);
+		return FALSE;
+	}
+
+	e->private_bio->bi_end_io = drbd_endio_write_sec;
+	e->w.cb = e_end_block;
+
+	spin_lock(&mdev->epoch_lock);
+	e->epoch = mdev->current_epoch;
+	atomic_inc(&e->epoch->epoch_size);
+	atomic_inc(&e->epoch->active);
+
+	if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
+		struct drbd_epoch *epoch;
+		/* Issue a barrier if we start a new epoch, and the previous epoch
+		   was not a epoch containing a single request which already was
+		   a Barrier. */
+		epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
+		if (epoch == e->epoch) {
+			set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+			trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER);
+			rw |= (1<<BIO_RW_BARRIER);
+			e->flags |= EE_IS_BARRIER;
+		} else {
+			if (atomic_read(&epoch->epoch_size) > 1 ||
+			    !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
+				set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+				trace_drbd_epoch(mdev, epoch, EV_TRACE_SETTING_BI);
+				set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
+				trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER);
+				rw |= (1<<BIO_RW_BARRIER);
+				e->flags |= EE_IS_BARRIER;
+			}
+		}
+	}
+	spin_unlock(&mdev->epoch_lock);
+
+	dp_flags = be32_to_cpu(p->dp_flags);
+	if (dp_flags & DP_HARDBARRIER) {
+		dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
+		/* rw |= (1<<BIO_RW_BARRIER); */
+	}
+	if (dp_flags & DP_RW_SYNC)
+		rw |= (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
+	if (dp_flags & DP_MAY_SET_IN_SYNC)
+		e->flags |= EE_MAY_SET_IN_SYNC;
+
+	/* I'm the receiver, I do hold a net_cnt reference. */
+	if (!mdev->net_conf->two_primaries) {
+		spin_lock_irq(&mdev->req_lock);
+	} else {
+		/* don't get the req_lock yet,
+		 * we may sleep in drbd_wait_peer_seq */
+		const int size = e->size;
+		const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+		DEFINE_WAIT(wait);
+		struct drbd_request *i;
+		struct hlist_node *n;
+		struct hlist_head *slot;
+		int first;
+
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		BUG_ON(mdev->ee_hash == NULL);
+		BUG_ON(mdev->tl_hash == NULL);
+
+		/* conflict detection and handling:
+		 * 1. wait on the sequence number,
+		 *    in case this data packet overtook ACK packets.
+		 * 2. check our hash tables for conflicting requests.
+		 *    we only need to walk the tl_hash, since an ee can not
+		 *    have a conflict with an other ee: on the submitting
+		 *    node, the corresponding req had already been conflicting,
+		 *    and a conflicting req is never sent.
+		 *
+		 * Note: for two_primaries, we are protocol C,
+		 * so there cannot be any request that is DONE
+		 * but still on the transfer log.
+		 *
+		 * unconditionally add to the ee_hash.
+		 *
+		 * if no conflicting request is found:
+		 *    submit.
+		 *
+		 * if any conflicting request is found
+		 * that has not yet been acked,
+		 * AND I have the "discard concurrent writes" flag:
+		 *	 queue (via done_ee) the P_DISCARD_ACK; OUT.
+		 *
+		 * if any conflicting request is found:
+		 *	 block the receiver, waiting on misc_wait
+		 *	 until no more conflicting requests are there,
+		 *	 or we get interrupted (disconnect).
+		 *
+		 *	 we do not just write after local io completion of those
+		 *	 requests, but only after req is done completely, i.e.
+		 *	 we wait for the P_DISCARD_ACK to arrive!
+		 *
+		 *	 then proceed normally, i.e. submit.
+		 */
+		if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
+			goto out_interrupted;
+
+		spin_lock_irq(&mdev->req_lock);
+
+		hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
+
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+		slot = tl_hash_slot(mdev, sector);
+		first = 1;
+		for (;;) {
+			int have_unacked = 0;
+			int have_conflict = 0;
+			prepare_to_wait(&mdev->misc_wait, &wait,
+				TASK_INTERRUPTIBLE);
+			hlist_for_each_entry(i, n, slot, colision) {
+				if (OVERLAPS) {
+					/* only ALERT on first iteration,
+					 * we may be woken up early... */
+					if (first)
+						dev_alert(DEV, "%s[%u] Concurrent local write detected!"
+						      "	new: %llus +%u; pending: %llus +%u\n",
+						      current->comm, current->pid,
+						      (unsigned long long)sector, size,
+						      (unsigned long long)i->sector, i->size);
+					if (i->rq_state & RQ_NET_PENDING)
+						++have_unacked;
+					++have_conflict;
+				}
+			}
+#undef OVERLAPS
+			if (!have_conflict)
+				break;
+
+			/* Discard Ack only for the _first_ iteration */
+			if (first && discard && have_unacked) {
+				dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
+				     (unsigned long long)sector);
+				inc_unacked(mdev);
+				e->w.cb = e_send_discard_ack;
+				list_add_tail(&e->w.list, &mdev->done_ee);
+
+				spin_unlock_irq(&mdev->req_lock);
+
+				/* we could probably send that P_DISCARD_ACK ourselves,
+				 * but I don't like the receiver using the msock */
+
+				put_ldev(mdev);
+				wake_asender(mdev);
+				finish_wait(&mdev->misc_wait, &wait);
+				return TRUE;
+			}
+
+			if (signal_pending(current)) {
+				hlist_del_init(&e->colision);
+
+				spin_unlock_irq(&mdev->req_lock);
+
+				finish_wait(&mdev->misc_wait, &wait);
+				goto out_interrupted;
+			}
+
+			spin_unlock_irq(&mdev->req_lock);
+			if (first) {
+				first = 0;
+				dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
+				     "sec=%llus\n", (unsigned long long)sector);
+			} else if (discard) {
+				/* we had none on the first iteration.
+				 * there must be none now. */
+				D_ASSERT(have_unacked == 0);
+			}
+			schedule();
+			spin_lock_irq(&mdev->req_lock);
+		}
+		finish_wait(&mdev->misc_wait, &wait);
+	}
+
+	list_add(&e->w.list, &mdev->active_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	switch (mdev->net_conf->wire_protocol) {
+	case DRBD_PROT_C:
+		inc_unacked(mdev);
+		/* corresponding dec_unacked() in e_end_block()
+		 * respective _drbd_clear_done_ee */
+		break;
+	case DRBD_PROT_B:
+		/* I really don't like it that the receiver thread
+		 * sends on the msock, but anyways */
+		drbd_send_ack(mdev, P_RECV_ACK, e);
+		break;
+	case DRBD_PROT_A:
+		/* nothing to do */
+		break;
+	}
+
+	if (mdev->state.pdsk == D_DISKLESS) {
+		/* In case we have the only disk of the cluster, */
+		drbd_set_out_of_sync(mdev, e->sector, e->size);
+		e->flags |= EE_CALL_AL_COMPLETE_IO;
+		drbd_al_begin_io(mdev, e->sector);
+	}
+
+	e->private_bio->bi_rw = rw;
+	trace_drbd_ee(mdev, e, "submitting for (data)write");
+	trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL);
+	drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
+	/* accounting done in endio */
+
+	maybe_kick_lo(mdev);
+	return TRUE;
+
+out_interrupted:
+	/* yes, the epoch_size now is imbalanced.
+	 * but we drop the connection anyways, so we don't have a chance to
+	 * receive a barrier... atomic_inc(&mdev->epoch_size); */
+	put_ldev(mdev);
+	drbd_free_ee(mdev, e);
+	return FALSE;
+}
+
+static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
+{
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+	struct drbd_epoch_entry *e;
+	struct digest_info *di = NULL;
+	int size, digest_size;
+	unsigned int fault_type;
+	struct p_block_req *p =
+		(struct p_block_req *)h;
+	const int brps = sizeof(*p)-sizeof(*h);
+
+	if (drbd_recv(mdev, h->payload, brps) != brps)
+		return FALSE;
+
+	sector = be64_to_cpu(p->sector);
+	size   = be32_to_cpu(p->blksize);
+
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+		dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return FALSE;
+	}
+	if (sector + (size>>9) > capacity) {
+		dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return FALSE;
+	}
+
+	if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Can not satisfy peer's read request, "
+			    "no local data.\n");
+		drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
+				 P_NEG_RS_DREPLY , p);
+		return TRUE;
+	}
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
+	if (!e) {
+		put_ldev(mdev);
+		return FALSE;
+	}
+
+	e->private_bio->bi_rw = READ;
+	e->private_bio->bi_end_io = drbd_endio_read_sec;
+
+	switch (h->command) {
+	case P_DATA_REQUEST:
+		e->w.cb = w_e_end_data_req;
+		fault_type = DRBD_FAULT_DT_RD;
+		break;
+	case P_RS_DATA_REQUEST:
+		e->w.cb = w_e_end_rsdata_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		/* Eventually this should become asynchronously. Currently it
+		 * blocks the whole receiver just to delay the reading of a
+		 * resync data block.
+		 * the drbd_work_queue mechanism is made for this...
+		 */
+		if (!drbd_rs_begin_io(mdev, sector)) {
+			/* we have been interrupted,
+			 * probably connection lost! */
+			D_ASSERT(signal_pending(current));
+			goto out_free_e;
+		}
+		break;
+
+	case P_OV_REPLY:
+	case P_CSUM_RS_REQUEST:
+		fault_type = DRBD_FAULT_RS_RD;
+		digest_size = h->length - brps ;
+		di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
+		if (!di)
+			goto out_free_e;
+
+		di->digest_size = digest_size;
+		di->digest = (((char *)di)+sizeof(struct digest_info));
+
+		if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
+			goto out_free_e;
+
+		e->block_id = (u64)(unsigned long)di;
+		if (h->command == P_CSUM_RS_REQUEST) {
+			D_ASSERT(mdev->agreed_pro_version >= 89);
+			e->w.cb = w_e_end_csum_rs_req;
+		} else if (h->command == P_OV_REPLY) {
+			e->w.cb = w_e_end_ov_reply;
+			dec_rs_pending(mdev);
+			break;
+		}
+
+		if (!drbd_rs_begin_io(mdev, sector)) {
+			/* we have been interrupted, probably connection lost! */
+			D_ASSERT(signal_pending(current));
+			goto out_free_e;
+		}
+		break;
+
+	case P_OV_REQUEST:
+		if (mdev->state.conn >= C_CONNECTED &&
+		    mdev->state.conn != C_VERIFY_T)
+			dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
+				drbd_conn_str(mdev->state.conn));
+		if (mdev->ov_start_sector == ~(sector_t)0 &&
+		    mdev->agreed_pro_version >= 90) {
+			mdev->ov_start_sector = sector;
+			mdev->ov_position = sector;
+			mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
+			dev_info(DEV, "Online Verify start sector: %llu\n",
+					(unsigned long long)sector);
+		}
+		e->w.cb = w_e_end_ov_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		/* Eventually this should become asynchronous. Currently it
+		 * blocks the whole receiver just to delay the reading of a
+		 * resync data block.
+		 * the drbd_work_queue mechanism is made for this...
+		 */
+		if (!drbd_rs_begin_io(mdev, sector)) {
+			/* we have been interrupted,
+			 * probably connection lost! */
+			D_ASSERT(signal_pending(current));
+			goto out_free_e;
+		}
+		break;
+
+
+	default:
+		dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+		    cmdname(h->command));
+		fault_type = DRBD_FAULT_MAX;
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	list_add(&e->w.list, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	inc_unacked(mdev);
+
+	trace_drbd_ee(mdev, e, "submitting for read");
+	trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL);
+	drbd_generic_make_request(mdev, fault_type, e->private_bio);
+	maybe_kick_lo(mdev);
+
+	return TRUE;
+
+out_free_e:
+	kfree(di);
+	put_ldev(mdev);
+	drbd_free_ee(mdev, e);
+	return FALSE;
+}
+
+static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
+{
+	int self, peer, rv = -100;
+	unsigned long ch_self, ch_peer;
+
+	self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+	peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+	ch_peer = mdev->p_uuid[UI_SIZE];
+	ch_self = mdev->comm_bm_set;
+
+	switch (mdev->net_conf->after_sb_0p) {
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+	case ASB_CALL_HELPER:
+		dev_err(DEV, "Configuration error.\n");
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_DISCARD_YOUNGER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = -1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv =  1;
+			break;
+		}
+		/* Else fall through to one of the other strategies... */
+	case ASB_DISCARD_OLDER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = 1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv = -1;
+			break;
+		}
+		/* Else fall through to one of the other strategies... */
+		dev_warn(DEV, "Discard younger/older primary did not found a decision\n"
+		     "Using discard-least-changes instead\n");
+	case ASB_DISCARD_ZERO_CHG:
+		if (ch_peer == 0 && ch_self == 0) {
+			rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+				? -1 : 1;
+			break;
+		} else {
+			if (ch_peer == 0) { rv =  1; break; }
+			if (ch_self == 0) { rv = -1; break; }
+		}
+		if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
+			break;
+	case ASB_DISCARD_LEAST_CHG:
+		if	(ch_self < ch_peer)
+			rv = -1;
+		else if (ch_self > ch_peer)
+			rv =  1;
+		else /* ( ch_self == ch_peer ) */
+		     /* Well, then use something else. */
+			rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
+				? -1 : 1;
+		break;
+	case ASB_DISCARD_LOCAL:
+		rv = -1;
+		break;
+	case ASB_DISCARD_REMOTE:
+		rv =  1;
+	}
+
+	return rv;
+}
+
+static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
+{
+	int self, peer, hg, rv = -100;
+
+	self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+	peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+	switch (mdev->net_conf->after_sb_1p) {
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+		dev_err(DEV, "Configuration error.\n");
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_CONSENSUS:
+		hg = drbd_asb_recover_0p(mdev);
+		if (hg == -1 && mdev->state.role == R_SECONDARY)
+			rv = hg;
+		if (hg == 1  && mdev->state.role == R_PRIMARY)
+			rv = hg;
+		break;
+	case ASB_VIOLENTLY:
+		rv = drbd_asb_recover_0p(mdev);
+		break;
+	case ASB_DISCARD_SECONDARY:
+		return mdev->state.role == R_PRIMARY ? 1 : -1;
+	case ASB_CALL_HELPER:
+		hg = drbd_asb_recover_0p(mdev);
+		if (hg == -1 && mdev->state.role == R_PRIMARY) {
+			self = drbd_set_role(mdev, R_SECONDARY, 0);
+			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we do not need to wait for the after state change work either. */
+			self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (self != SS_SUCCESS) {
+				drbd_khelper(mdev, "pri-lost-after-sb");
+			} else {
+				dev_warn(DEV, "Successfully gave up primary role.\n");
+				rv = hg;
+			}
+		} else
+			rv = hg;
+	}
+
+	return rv;
+}
+
+static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
+{
+	int self, peer, hg, rv = -100;
+
+	self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
+	peer = mdev->p_uuid[UI_BITMAP] & 1;
+
+	switch (mdev->net_conf->after_sb_2p) {
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+		dev_err(DEV, "Configuration error.\n");
+		break;
+	case ASB_VIOLENTLY:
+		rv = drbd_asb_recover_0p(mdev);
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_CALL_HELPER:
+		hg = drbd_asb_recover_0p(mdev);
+		if (hg == -1) {
+			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we do not need to wait for the after state change work either. */
+			self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (self != SS_SUCCESS) {
+				drbd_khelper(mdev, "pri-lost-after-sb");
+			} else {
+				dev_warn(DEV, "Successfully gave up primary role.\n");
+				rv = hg;
+			}
+		} else
+			rv = hg;
+	}
+
+	return rv;
+}
+
+static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
+			   u64 bits, u64 flags)
+{
+	if (!uuid) {
+		dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
+		return;
+	}
+	dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+	     text,
+	     (unsigned long long)uuid[UI_CURRENT],
+	     (unsigned long long)uuid[UI_BITMAP],
+	     (unsigned long long)uuid[UI_HISTORY_START],
+	     (unsigned long long)uuid[UI_HISTORY_END],
+	     (unsigned long long)bits,
+	     (unsigned long long)flags);
+}
+
+/*
+  100	after split brain try auto recover
+    2	C_SYNC_SOURCE set BitMap
+    1	C_SYNC_SOURCE use BitMap
+    0	no Sync
+   -1	C_SYNC_TARGET use BitMap
+   -2	C_SYNC_TARGET set BitMap
+ -100	after split brain, disconnect
+-1000	unrelated data
+ */
+static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
+{
+	u64 self, peer;
+	int i, j;
+
+	self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+	peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+
+	*rule_nr = 10;
+	if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
+		return 0;
+
+	*rule_nr = 20;
+	if ((self == UUID_JUST_CREATED || self == (u64)0) &&
+	     peer != UUID_JUST_CREATED)
+		return -2;
+
+	*rule_nr = 30;
+	if (self != UUID_JUST_CREATED &&
+	    (peer == UUID_JUST_CREATED || peer == (u64)0))
+		return 2;
+
+	if (self == peer) {
+		int rct, dc; /* roles at crash time */
+
+		if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+
+			if (mdev->agreed_pro_version < 91)
+				return -1001;
+
+			if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
+			    (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
+				dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
+				drbd_uuid_set_bm(mdev, 0UL);
+
+				drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
+					       mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+				*rule_nr = 34;
+			} else {
+				dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
+				*rule_nr = 36;
+			}
+
+			return 1;
+		}
+
+		if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
+
+			if (mdev->agreed_pro_version < 91)
+				return -1001;
+
+			if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
+			    (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
+				dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+
+				mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
+				mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
+				mdev->p_uuid[UI_BITMAP] = 0UL;
+
+				drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+				*rule_nr = 35;
+			} else {
+				dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
+				*rule_nr = 37;
+			}
+
+			return -1;
+		}
+
+		/* Common power [off|failure] */
+		rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
+			(mdev->p_uuid[UI_FLAGS] & 2);
+		/* lowest bit is set when we were primary,
+		 * next bit (weight 2) is set when peer was primary */
+		*rule_nr = 40;
+
+		switch (rct) {
+		case 0: /* !self_pri && !peer_pri */ return 0;
+		case 1: /*  self_pri && !peer_pri */ return 1;
+		case 2: /* !self_pri &&  peer_pri */ return -1;
+		case 3: /*  self_pri &&  peer_pri */
+			dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
+			return dc ? -1 : 1;
+		}
+	}
+
+	*rule_nr = 50;
+	peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer)
+		return -1;
+
+	*rule_nr = 51;
+	peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+		peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
+		if (self == peer) {
+			/* The last P_SYNC_UUID did not get though. Undo the last start of
+			   resync as sync source modifications of the peer's UUIDs. */
+
+			if (mdev->agreed_pro_version < 91)
+				return -1001;
+
+			mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
+			mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
+			return -1;
+		}
+	}
+
+	*rule_nr = 60;
+	self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		peer = mdev->p_uuid[i] & ~((u64)1);
+		if (self == peer)
+			return -2;
+	}
+
+	*rule_nr = 70;
+	self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
+	if (self == peer)
+		return 1;
+
+	*rule_nr = 71;
+	self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
+		peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
+		if (self == peer) {
+			/* The last P_SYNC_UUID did not get though. Undo the last start of
+			   resync as sync source modifications of our UUIDs. */
+
+			if (mdev->agreed_pro_version < 91)
+				return -1001;
+
+			_drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
+			_drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
+
+			dev_info(DEV, "Undid last start of resync:\n");
+
+			drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
+				       mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
+
+			return 1;
+		}
+	}
+
+
+	*rule_nr = 80;
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = mdev->ldev->md.uuid[i] & ~((u64)1);
+		if (self == peer)
+			return 2;
+	}
+
+	*rule_nr = 90;
+	self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer && self != ((u64)0))
+		return 100;
+
+	*rule_nr = 100;
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = mdev->ldev->md.uuid[i] & ~((u64)1);
+		for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
+			peer = mdev->p_uuid[j] & ~((u64)1);
+			if (self == peer)
+				return -100;
+		}
+	}
+
+	return -1000;
+}
+
+/* drbd_sync_handshake() returns the new conn state on success, or
+   CONN_MASK (-1) on failure.
+ */
+static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
+					   enum drbd_disk_state peer_disk) __must_hold(local)
+{
+	int hg, rule_nr;
+	enum drbd_conns rv = C_MASK;
+	enum drbd_disk_state mydisk;
+
+	mydisk = mdev->state.disk;
+	if (mydisk == D_NEGOTIATING)
+		mydisk = mdev->new_state_tmp.disk;
+
+	dev_info(DEV, "drbd_sync_handshake:\n");
+	drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
+	drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
+		       mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+
+	hg = drbd_uuid_compare(mdev, &rule_nr);
+
+	dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+
+	if (hg == -1000) {
+		dev_alert(DEV, "Unrelated data, aborting!\n");
+		return C_MASK;
+	}
+	if (hg == -1001) {
+		dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
+		return C_MASK;
+	}
+
+	if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
+	    (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
+		int f = (hg == -100) || abs(hg) == 2;
+		hg = mydisk > D_INCONSISTENT ? 1 : -1;
+		if (f)
+			hg = hg*2;
+		dev_info(DEV, "Becoming sync %s due to disk states.\n",
+		     hg > 0 ? "source" : "target");
+	}
+
+	if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
+		int pcount = (mdev->state.role == R_PRIMARY)
+			   + (peer_role == R_PRIMARY);
+		int forced = (hg == -100);
+
+		switch (pcount) {
+		case 0:
+			hg = drbd_asb_recover_0p(mdev);
+			break;
+		case 1:
+			hg = drbd_asb_recover_1p(mdev);
+			break;
+		case 2:
+			hg = drbd_asb_recover_2p(mdev);
+			break;
+		}
+		if (abs(hg) < 100) {
+			dev_warn(DEV, "Split-Brain detected, %d primaries, "
+			     "automatically solved. Sync from %s node\n",
+			     pcount, (hg < 0) ? "peer" : "this");
+			if (forced) {
+				dev_warn(DEV, "Doing a full sync, since"
+				     " UUIDs where ambiguous.\n");
+				hg = hg*2;
+			}
+		}
+	}
+
+	if (hg == -100) {
+		if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
+			hg = -1;
+		if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
+			hg = 1;
+
+		if (abs(hg) < 100)
+			dev_warn(DEV, "Split-Brain detected, manually solved. "
+			     "Sync from %s node\n",
+			     (hg < 0) ? "peer" : "this");
+	}
+
+	if (hg == -100) {
+		dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
+		drbd_khelper(mdev, "split-brain");
+		return C_MASK;
+	}
+
+	if (hg > 0 && mydisk <= D_INCONSISTENT) {
+		dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
+		return C_MASK;
+	}
+
+	if (hg < 0 && /* by intention we do not use mydisk here. */
+	    mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
+		switch (mdev->net_conf->rr_conflict) {
+		case ASB_CALL_HELPER:
+			drbd_khelper(mdev, "pri-lost");
+			/* fall through */
+		case ASB_DISCONNECT:
+			dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
+			return C_MASK;
+		case ASB_VIOLENTLY:
+			dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
+			     "assumption\n");
+		}
+	}
+
+	if (abs(hg) >= 2) {
+		dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
+			return C_MASK;
+	}
+
+	if (hg > 0) { /* become sync source. */
+		rv = C_WF_BITMAP_S;
+	} else if (hg < 0) { /* become sync target */
+		rv = C_WF_BITMAP_T;
+	} else {
+		rv = C_CONNECTED;
+		if (drbd_bm_total_weight(mdev)) {
+			dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
+			     drbd_bm_total_weight(mdev));
+		}
+	}
+
+	return rv;
+}
+
+/* returns 1 if invalid */
+static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
+{
+	/* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
+	if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
+	    (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
+		return 0;
+
+	/* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
+	if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
+	    self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
+		return 1;
+
+	/* everything else is valid if they are equal on both sides. */
+	if (peer == self)
+		return 0;
+
+	/* everything es is invalid. */
+	return 1;
+}
+
+static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_protocol *p = (struct p_protocol *)h;
+	int header_size, data_size;
+	int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+	int p_want_lose, p_two_primaries;
+	char p_integrity_alg[SHARED_SECRET_MAX] = "";
+
+	header_size = sizeof(*p) - sizeof(*h);
+	data_size   = h->length  - header_size;
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	p_proto		= be32_to_cpu(p->protocol);
+	p_after_sb_0p	= be32_to_cpu(p->after_sb_0p);
+	p_after_sb_1p	= be32_to_cpu(p->after_sb_1p);
+	p_after_sb_2p	= be32_to_cpu(p->after_sb_2p);
+	p_want_lose	= be32_to_cpu(p->want_lose);
+	p_two_primaries = be32_to_cpu(p->two_primaries);
+
+	if (p_proto != mdev->net_conf->wire_protocol) {
+		dev_err(DEV, "incompatible communication protocols\n");
+		goto disconnect;
+	}
+
+	if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
+		dev_err(DEV, "incompatible after-sb-0pri settings\n");
+		goto disconnect;
+	}
+
+	if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
+		dev_err(DEV, "incompatible after-sb-1pri settings\n");
+		goto disconnect;
+	}
+
+	if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
+		dev_err(DEV, "incompatible after-sb-2pri settings\n");
+		goto disconnect;
+	}
+
+	if (p_want_lose && mdev->net_conf->want_lose) {
+		dev_err(DEV, "both sides have the 'want_lose' flag set\n");
+		goto disconnect;
+	}
+
+	if (p_two_primaries != mdev->net_conf->two_primaries) {
+		dev_err(DEV, "incompatible setting of the two-primaries options\n");
+		goto disconnect;
+	}
+
+	if (mdev->agreed_pro_version >= 87) {
+		unsigned char *my_alg = mdev->net_conf->integrity_alg;
+
+		if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
+			return FALSE;
+
+		p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
+		if (strcmp(p_integrity_alg, my_alg)) {
+			dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
+			goto disconnect;
+		}
+		dev_info(DEV, "data-integrity-alg: %s\n",
+		     my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
+	}
+
+	return TRUE;
+
+disconnect:
+	drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	return FALSE;
+}
+
+/* helper function
+ * input: alg name, feature name
+ * return: NULL (alg name was "")
+ *         ERR_PTR(error) if something goes wrong
+ *         or the crypto hash ptr, if it worked out ok. */
+struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
+		const char *alg, const char *name)
+{
+	struct crypto_hash *tfm;
+
+	if (!alg[0])
+		return NULL;
+
+	tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm)) {
+		dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
+			alg, name, PTR_ERR(tfm));
+		return tfm;
+	}
+	if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
+		crypto_free_hash(tfm);
+		dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
+		return ERR_PTR(-EINVAL);
+	}
+	return tfm;
+}
+
+static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
+{
+	int ok = TRUE;
+	struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
+	unsigned int header_size, data_size, exp_max_sz;
+	struct crypto_hash *verify_tfm = NULL;
+	struct crypto_hash *csums_tfm = NULL;
+	const int apv = mdev->agreed_pro_version;
+
+	exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
+		    : apv == 88 ? sizeof(struct p_rs_param)
+					+ SHARED_SECRET_MAX
+		    : /* 89 */    sizeof(struct p_rs_param_89);
+
+	if (h->length > exp_max_sz) {
+		dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
+		    h->length, exp_max_sz);
+		return FALSE;
+	}
+
+	if (apv <= 88) {
+		header_size = sizeof(struct p_rs_param) - sizeof(*h);
+		data_size   = h->length  - header_size;
+	} else /* apv >= 89 */ {
+		header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
+		data_size   = h->length  - header_size;
+		D_ASSERT(data_size == 0);
+	}
+
+	/* initialize verify_alg and csums_alg */
+	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+	if (drbd_recv(mdev, h->payload, header_size) != header_size)
+		return FALSE;
+
+	mdev->sync_conf.rate	  = be32_to_cpu(p->rate);
+
+	if (apv >= 88) {
+		if (apv == 88) {
+			if (data_size > SHARED_SECRET_MAX) {
+				dev_err(DEV, "verify-alg too long, "
+				    "peer wants %u, accepting only %u byte\n",
+						data_size, SHARED_SECRET_MAX);
+				return FALSE;
+			}
+
+			if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
+				return FALSE;
+
+			/* we expect NUL terminated string */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(p->verify_alg[data_size-1] == 0);
+			p->verify_alg[data_size-1] = 0;
+
+		} else /* apv >= 89 */ {
+			/* we still expect NUL terminated strings */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+			D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+			p->verify_alg[SHARED_SECRET_MAX-1] = 0;
+			p->csums_alg[SHARED_SECRET_MAX-1] = 0;
+		}
+
+		if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
+			if (mdev->state.conn == C_WF_REPORT_PARAMS) {
+				dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    mdev->sync_conf.verify_alg, p->verify_alg);
+				goto disconnect;
+			}
+			verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
+					p->verify_alg, "verify-alg");
+			if (IS_ERR(verify_tfm)) {
+				verify_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+		if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
+			if (mdev->state.conn == C_WF_REPORT_PARAMS) {
+				dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    mdev->sync_conf.csums_alg, p->csums_alg);
+				goto disconnect;
+			}
+			csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
+					p->csums_alg, "csums-alg");
+			if (IS_ERR(csums_tfm)) {
+				csums_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+
+		spin_lock(&mdev->peer_seq_lock);
+		/* lock against drbd_nl_syncer_conf() */
+		if (verify_tfm) {
+			strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
+			mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
+			crypto_free_hash(mdev->verify_tfm);
+			mdev->verify_tfm = verify_tfm;
+			dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
+		}
+		if (csums_tfm) {
+			strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
+			mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
+			crypto_free_hash(mdev->csums_tfm);
+			mdev->csums_tfm = csums_tfm;
+			dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
+		}
+		spin_unlock(&mdev->peer_seq_lock);
+	}
+
+	return ok;
+disconnect:
+	/* just for completeness: actually not needed,
+	 * as this is not reached if csums_tfm was ok. */
+	crypto_free_hash(csums_tfm);
+	/* but free the verify_tfm again, if csums_tfm did not work out */
+	crypto_free_hash(verify_tfm);
+	drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	return FALSE;
+}
+
+static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
+{
+	/* sorry, we currently have no working implementation
+	 * of distributed TCQ */
+}
+
+/* warn if the arguments differ by more than 12.5% */
+static void warn_if_differ_considerably(struct drbd_conf *mdev,
+	const char *s, sector_t a, sector_t b)
+{
+	sector_t d;
+	if (a == 0 || b == 0)
+		return;
+	d = (a > b) ? (a - b) : (b - a);
+	if (d > (a>>3) || d > (b>>3))
+		dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
+		     (unsigned long long)a, (unsigned long long)b);
+}
+
+static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_sizes *p = (struct p_sizes *)h;
+	enum determine_dev_size dd = unchanged;
+	unsigned int max_seg_s;
+	sector_t p_size, p_usize, my_usize;
+	int ldsc = 0; /* local disk size changed */
+	enum drbd_conns nconn;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	p_size = be64_to_cpu(p->d_size);
+	p_usize = be64_to_cpu(p->u_size);
+
+	if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
+		dev_err(DEV, "some backing storage is needed\n");
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		return FALSE;
+	}
+
+	/* just store the peer's disk size for now.
+	 * we still need to figure out whether we accept that. */
+	mdev->p_size = p_size;
+
+#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
+	if (get_ldev(mdev)) {
+		warn_if_differ_considerably(mdev, "lower level device sizes",
+			   p_size, drbd_get_max_capacity(mdev->ldev));
+		warn_if_differ_considerably(mdev, "user requested size",
+					    p_usize, mdev->ldev->dc.disk_size);
+
+		/* if this is the first connect, or an otherwise expected
+		 * param exchange, choose the minimum */
+		if (mdev->state.conn == C_WF_REPORT_PARAMS)
+			p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
+					     p_usize);
+
+		my_usize = mdev->ldev->dc.disk_size;
+
+		if (mdev->ldev->dc.disk_size != p_usize) {
+			mdev->ldev->dc.disk_size = p_usize;
+			dev_info(DEV, "Peer sets u_size to %lu sectors\n",
+			     (unsigned long)mdev->ldev->dc.disk_size);
+		}
+
+		/* Never shrink a device with usable data during connect.
+		   But allow online shrinking if we are connected. */
+		if (drbd_new_dev_size(mdev, mdev->ldev) <
+		   drbd_get_capacity(mdev->this_bdev) &&
+		   mdev->state.disk >= D_OUTDATED &&
+		   mdev->state.conn < C_CONNECTED) {
+			dev_err(DEV, "The peer's disk size is too small!\n");
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			mdev->ldev->dc.disk_size = my_usize;
+			put_ldev(mdev);
+			return FALSE;
+		}
+		put_ldev(mdev);
+	}
+#undef min_not_zero
+
+	if (get_ldev(mdev)) {
+		dd = drbd_determin_dev_size(mdev);
+		put_ldev(mdev);
+		if (dd == dev_size_error)
+			return FALSE;
+		drbd_md_sync(mdev);
+	} else {
+		/* I am diskless, need to accept the peer's size. */
+		drbd_set_my_capacity(mdev, p_size);
+	}
+
+	if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
+		nconn = drbd_sync_handshake(mdev,
+				mdev->state.peer, mdev->state.pdsk);
+		put_ldev(mdev);
+
+		if (nconn == C_MASK) {
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			return FALSE;
+		}
+
+		if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			return FALSE;
+		}
+	}
+
+	if (get_ldev(mdev)) {
+		if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
+			mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+			ldsc = 1;
+		}
+
+		max_seg_s = be32_to_cpu(p->max_segment_size);
+		if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
+			drbd_setup_queue_param(mdev, max_seg_s);
+
+		drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
+		put_ldev(mdev);
+	}
+
+	if (mdev->state.conn > C_WF_REPORT_PARAMS) {
+		if (be64_to_cpu(p->c_size) !=
+		    drbd_get_capacity(mdev->this_bdev) || ldsc) {
+			/* we have different sizes, probably peer
+			 * needs to know my new size... */
+			drbd_send_sizes(mdev, 0);
+		}
+		if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
+		    (dd == grew && mdev->state.conn == C_CONNECTED)) {
+			if (mdev->state.pdsk >= D_INCONSISTENT &&
+			    mdev->state.disk >= D_INCONSISTENT)
+				resync_after_online_grow(mdev);
+			else
+				set_bit(RESYNC_AFTER_NEG, &mdev->flags);
+		}
+	}
+
+	return TRUE;
+}
+
+static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_uuids *p = (struct p_uuids *)h;
+	u64 *p_uuid;
+	int i;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
+
+	for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
+		p_uuid[i] = be64_to_cpu(p->uuid[i]);
+
+	kfree(mdev->p_uuid);
+	mdev->p_uuid = p_uuid;
+
+	if (mdev->state.conn < C_CONNECTED &&
+	    mdev->state.disk < D_INCONSISTENT &&
+	    mdev->state.role == R_PRIMARY &&
+	    (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+		dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
+		    (unsigned long long)mdev->ed_uuid);
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		return FALSE;
+	}
+
+	if (get_ldev(mdev)) {
+		int skip_initial_sync =
+			mdev->state.conn == C_CONNECTED &&
+			mdev->agreed_pro_version >= 90 &&
+			mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
+			(p_uuid[UI_FLAGS] & 8);
+		if (skip_initial_sync) {
+			dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
+			drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
+					"clear_n_write from receive_uuids");
+			_drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
+			_drbd_uuid_set(mdev, UI_BITMAP, 0);
+			_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			drbd_md_sync(mdev);
+		}
+		put_ldev(mdev);
+	}
+
+	/* Before we test for the disk state, we should wait until an eventually
+	   ongoing cluster wide state change is finished. That is important if
+	   we are primary and are detaching from our disk. We need to see the
+	   new disk state... */
+	wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
+	if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
+		drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
+
+	return TRUE;
+}
+
+/**
+ * convert_state() - Converts the peer's view of the cluster state to our point of view
+ * @ps:		The state as seen by the peer.
+ */
+static union drbd_state convert_state(union drbd_state ps)
+{
+	union drbd_state ms;
+
+	static enum drbd_conns c_tab[] = {
+		[C_CONNECTED] = C_CONNECTED,
+
+		[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
+		[C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
+		[C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
+		[C_VERIFY_S]       = C_VERIFY_T,
+		[C_MASK]   = C_MASK,
+	};
+
+	ms.i = ps.i;
+
+	ms.conn = c_tab[ps.conn];
+	ms.peer = ps.role;
+	ms.role = ps.peer;
+	ms.pdsk = ps.disk;
+	ms.disk = ps.pdsk;
+	ms.peer_isp = (ps.aftr_isp | ps.user_isp);
+
+	return ms;
+}
+
+static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_req_state *p = (struct p_req_state *)h;
+	union drbd_state mask, val;
+	int rv;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	mask.i = be32_to_cpu(p->mask);
+	val.i = be32_to_cpu(p->val);
+
+	if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
+	    test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
+		drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
+		return TRUE;
+	}
+
+	mask = convert_state(mask);
+	val = convert_state(val);
+
+	rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
+
+	drbd_send_sr_reply(mdev, rv);
+	drbd_md_sync(mdev);
+
+	return TRUE;
+}
+
+static int receive_state(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_state *p = (struct p_state *)h;
+	enum drbd_conns nconn, oconn;
+	union drbd_state ns, peer_state;
+	enum drbd_disk_state real_peer_disk;
+	int rv;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
+		return FALSE;
+
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	peer_state.i = be32_to_cpu(p->state);
+
+	real_peer_disk = peer_state.disk;
+	if (peer_state.disk == D_NEGOTIATING) {
+		real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
+		dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+ retry:
+	oconn = nconn = mdev->state.conn;
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (nconn == C_WF_REPORT_PARAMS)
+		nconn = C_CONNECTED;
+
+	if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
+	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		int cr; /* consider resync */
+
+		/* if we established a new connection */
+		cr  = (oconn < C_CONNECTED);
+		/* if we had an established connection
+		 * and one of the nodes newly attaches a disk */
+		cr |= (oconn == C_CONNECTED &&
+		       (peer_state.disk == D_NEGOTIATING ||
+			mdev->state.disk == D_NEGOTIATING));
+		/* if we have both been inconsistent, and the peer has been
+		 * forced to be UpToDate with --overwrite-data */
+		cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
+		/* if we had been plain connected, and the admin requested to
+		 * start a sync by "invalidate" or "invalidate-remote" */
+		cr |= (oconn == C_CONNECTED &&
+				(peer_state.conn >= C_STARTING_SYNC_S &&
+				 peer_state.conn <= C_WF_BITMAP_T));
+
+		if (cr)
+			nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+
+		put_ldev(mdev);
+		if (nconn == C_MASK) {
+			if (mdev->state.disk == D_NEGOTIATING) {
+				drbd_force_state(mdev, NS(disk, D_DISKLESS));
+				nconn = C_CONNECTED;
+			} else if (peer_state.disk == D_NEGOTIATING) {
+				dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
+				peer_state.disk = D_DISKLESS;
+			} else {
+				D_ASSERT(oconn == C_WF_REPORT_PARAMS);
+				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+				return FALSE;
+			}
+		}
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	if (mdev->state.conn != oconn)
+		goto retry;
+	clear_bit(CONSIDER_RESYNC, &mdev->flags);
+	ns.i = mdev->state.i;
+	ns.conn = nconn;
+	ns.peer = peer_state.role;
+	ns.pdsk = real_peer_disk;
+	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
+	if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+		ns.disk = mdev->new_state_tmp.disk;
+
+	rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
+	ns = mdev->state;
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS) {
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		return FALSE;
+	}
+
+	if (oconn > C_WF_REPORT_PARAMS) {
+		if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+		    peer_state.disk != D_NEGOTIATING ) {
+			/* we want resync, peer has not yet decided to sync... */
+			/* Nowadays only used when forcing a node into primary role and
+			   setting its disk to UpToDate with that */
+			drbd_send_uuids(mdev);
+			drbd_send_state(mdev);
+		}
+	}
+
+	mdev->net_conf->want_lose = 0;
+
+	drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
+
+	return TRUE;
+}
+
+static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_rs_uuid *p = (struct p_rs_uuid *)h;
+
+	wait_event(mdev->misc_wait,
+		   mdev->state.conn == C_WF_SYNC_UUID ||
+		   mdev->state.conn < C_CONNECTED ||
+		   mdev->state.disk < D_NEGOTIATING);
+
+	/* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	/* Here the _drbd_uuid_ functions are right, current should
+	   _not_ be rotated into the history */
+	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		_drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
+		_drbd_uuid_set(mdev, UI_BITMAP, 0UL);
+
+		drbd_start_resync(mdev, C_SYNC_TARGET);
+
+		put_ldev(mdev);
+	} else
+		dev_err(DEV, "Ignoring SyncUUID packet!\n");
+
+	return TRUE;
+}
+
+enum receive_bitmap_ret { OK, DONE, FAILED };
+
+static enum receive_bitmap_ret
+receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
+	unsigned long *buffer, struct bm_xfer_ctx *c)
+{
+	unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
+	unsigned want = num_words * sizeof(long);
+
+	if (want != h->length) {
+		dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
+		return FAILED;
+	}
+	if (want == 0)
+		return DONE;
+	if (drbd_recv(mdev, buffer, want) != want)
+		return FAILED;
+
+	drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
+
+	c->word_offset += num_words;
+	c->bit_offset = c->word_offset * BITS_PER_LONG;
+	if (c->bit_offset > c->bm_bits)
+		c->bit_offset = c->bm_bits;
+
+	return OK;
+}
+
+static enum receive_bitmap_ret
+recv_bm_rle_bits(struct drbd_conf *mdev,
+		struct p_compressed_bm *p,
+		struct bm_xfer_ctx *c)
+{
+	struct bitstream bs;
+	u64 look_ahead;
+	u64 rl;
+	u64 tmp;
+	unsigned long s = c->bit_offset;
+	unsigned long e;
+	int len = p->head.length - (sizeof(*p) - sizeof(p->head));
+	int toggle = DCBP_get_start(p);
+	int have;
+	int bits;
+
+	bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
+
+	bits = bitstream_get_bits(&bs, &look_ahead, 64);
+	if (bits < 0)
+		return FAILED;
+
+	for (have = bits; have > 0; s += rl, toggle = !toggle) {
+		bits = vli_decode_bits(&rl, look_ahead);
+		if (bits <= 0)
+			return FAILED;
+
+		if (toggle) {
+			e = s + rl -1;
+			if (e >= c->bm_bits) {
+				dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+				return FAILED;
+			}
+			_drbd_bm_set_bits(mdev, s, e);
+		}
+
+		if (have < bits) {
+			dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+				have, bits, look_ahead,
+				(unsigned int)(bs.cur.b - p->code),
+				(unsigned int)bs.buf_len);
+			return FAILED;
+		}
+		look_ahead >>= bits;
+		have -= bits;
+
+		bits = bitstream_get_bits(&bs, &tmp, 64 - have);
+		if (bits < 0)
+			return FAILED;
+		look_ahead |= tmp << have;
+		have += bits;
+	}
+
+	c->bit_offset = s;
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	return (s == c->bm_bits) ? DONE : OK;
+}
+
+static enum receive_bitmap_ret
+decode_bitmap_c(struct drbd_conf *mdev,
+		struct p_compressed_bm *p,
+		struct bm_xfer_ctx *c)
+{
+	if (DCBP_get_code(p) == RLE_VLI_Bits)
+		return recv_bm_rle_bits(mdev, p, c);
+
+	/* other variants had been implemented for evaluation,
+	 * but have been dropped as this one turned out to be "best"
+	 * during all our tests. */
+
+	dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+	return FAILED;
+}
+
+void INFO_bm_xfer_stats(struct drbd_conf *mdev,
+		const char *direction, struct bm_xfer_ctx *c)
+{
+	/* what would it take to transfer it "plaintext" */
+	unsigned plain = sizeof(struct p_header) *
+		((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
+		+ c->bm_words * sizeof(long);
+	unsigned total = c->bytes[0] + c->bytes[1];
+	unsigned r;
+
+	/* total can not be zero. but just in case: */
+	if (total == 0)
+		return;
+
+	/* don't report if not compressed */
+	if (total >= plain)
+		return;
+
+	/* total < plain. check for overflow, still */
+	r = (total > UINT_MAX/1000) ? (total / (plain/1000))
+		                    : (1000 * total / plain);
+
+	if (r > 1000)
+		r = 1000;
+
+	r = 1000 - r;
+	dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+	     "total %u; compression: %u.%u%%\n",
+			direction,
+			c->bytes[1], c->packets[1],
+			c->bytes[0], c->packets[0],
+			total, r/10, r % 10);
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+   it does not matter if the process it in 32 bit chunks or 64 bit
+   chunks as long as it is little endian. (Understand it as byte stream,
+   beginning with the lowest byte...) If we would use big endian
+   we would need to process it from the highest address to the lowest,
+   in order to be agnostic to the 32 vs 64 bits issue.
+
+   returns 0 on failure, 1 if we successfully received it. */
+static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct bm_xfer_ctx c;
+	void *buffer;
+	enum receive_bitmap_ret ret;
+	int ok = FALSE;
+
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+
+	drbd_bm_lock(mdev, "receive bitmap");
+
+	/* maybe we should use some per thread scratch page,
+	 * and allocate that during initial device creation? */
+	buffer	 = (unsigned long *) __get_free_page(GFP_NOIO);
+	if (!buffer) {
+		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+		goto out;
+	}
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(mdev),
+		.bm_words = drbd_bm_words(mdev),
+	};
+
+	do {
+		if (h->command == P_BITMAP) {
+			ret = receive_bitmap_plain(mdev, h, buffer, &c);
+		} else if (h->command == P_COMPRESSED_BITMAP) {
+			/* MAYBE: sanity check that we speak proto >= 90,
+			 * and the feature is enabled! */
+			struct p_compressed_bm *p;
+
+			if (h->length > BM_PACKET_PAYLOAD_BYTES) {
+				dev_err(DEV, "ReportCBitmap packet too large\n");
+				goto out;
+			}
+			/* use the page buff */
+			p = buffer;
+			memcpy(p, h, sizeof(*h));
+			if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
+				goto out;
+			if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
+				dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
+				return FAILED;
+			}
+			ret = decode_bitmap_c(mdev, p, &c);
+		} else {
+			dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
+			goto out;
+		}
+
+		c.packets[h->command == P_BITMAP]++;
+		c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
+
+		if (ret != OK)
+			break;
+
+		if (!drbd_recv_header(mdev, h))
+			goto out;
+	} while (ret == OK);
+	if (ret == FAILED)
+		goto out;
+
+	INFO_bm_xfer_stats(mdev, "receive", &c);
+
+	if (mdev->state.conn == C_WF_BITMAP_T) {
+		ok = !drbd_send_bitmap(mdev);
+		if (!ok)
+			goto out;
+		/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
+		ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		D_ASSERT(ok == SS_SUCCESS);
+	} else if (mdev->state.conn != C_WF_BITMAP_S) {
+		/* admin may have requested C_DISCONNECTING,
+		 * other threads may have noticed network errors */
+		dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
+		    drbd_conn_str(mdev->state.conn));
+	}
+
+	ok = TRUE;
+ out:
+	drbd_bm_unlock(mdev);
+	if (ok && mdev->state.conn == C_WF_BITMAP_S)
+		drbd_start_resync(mdev, C_SYNC_SOURCE);
+	free_page((unsigned long) buffer);
+	return ok;
+}
+
+static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
+{
+	/* TODO zero copy sink :) */
+	static char sink[128];
+	int size, want, r;
+
+	dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
+	     h->command, h->length);
+
+	size = h->length;
+	while (size > 0) {
+		want = min_t(int, size, sizeof(sink));
+		r = drbd_recv(mdev, sink, want);
+		ERR_IF(r <= 0) break;
+		size -= r;
+	}
+	return size == 0;
+}
+
+static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
+{
+	if (mdev->state.disk >= D_INCONSISTENT)
+		drbd_kick_lo(mdev);
+
+	/* Make sure we've acked all the TCP data associated
+	 * with the data requests being unplugged */
+	drbd_tcp_quickack(mdev->data.socket);
+
+	return TRUE;
+}
+
+typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
+
+static drbd_cmd_handler_f drbd_default_handler[] = {
+	[P_DATA]	    = receive_Data,
+	[P_DATA_REPLY]	    = receive_DataReply,
+	[P_RS_DATA_REPLY]   = receive_RSDataReply,
+	[P_BARRIER]	    = receive_Barrier,
+	[P_BITMAP]	    = receive_bitmap,
+	[P_COMPRESSED_BITMAP]    = receive_bitmap,
+	[P_UNPLUG_REMOTE]   = receive_UnplugRemote,
+	[P_DATA_REQUEST]    = receive_DataRequest,
+	[P_RS_DATA_REQUEST] = receive_DataRequest,
+	[P_SYNC_PARAM]	    = receive_SyncParam,
+	[P_SYNC_PARAM89]	   = receive_SyncParam,
+	[P_PROTOCOL]        = receive_protocol,
+	[P_UUIDS]	    = receive_uuids,
+	[P_SIZES]	    = receive_sizes,
+	[P_STATE]	    = receive_state,
+	[P_STATE_CHG_REQ]   = receive_req_state,
+	[P_SYNC_UUID]       = receive_sync_uuid,
+	[P_OV_REQUEST]      = receive_DataRequest,
+	[P_OV_REPLY]        = receive_DataRequest,
+	[P_CSUM_RS_REQUEST]    = receive_DataRequest,
+	/* anything missing from this table is in
+	 * the asender_tbl, see get_asender_cmd */
+	[P_MAX_CMD]	    = NULL,
+};
+
+static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
+static drbd_cmd_handler_f *drbd_opt_cmd_handler;
+
+static void drbdd(struct drbd_conf *mdev)
+{
+	drbd_cmd_handler_f handler;
+	struct p_header *header = &mdev->data.rbuf.header;
+
+	while (get_t_state(&mdev->receiver) == Running) {
+		drbd_thread_current_set_cpu(mdev);
+		if (!drbd_recv_header(mdev, header))
+			break;
+
+		if (header->command < P_MAX_CMD)
+			handler = drbd_cmd_handler[header->command];
+		else if (P_MAY_IGNORE < header->command
+		     && header->command < P_MAX_OPT_CMD)
+			handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
+		else if (header->command > P_MAX_OPT_CMD)
+			handler = receive_skip;
+		else
+			handler = NULL;
+
+		if (unlikely(!handler)) {
+			dev_err(DEV, "unknown packet type %d, l: %d!\n",
+			    header->command, header->length);
+			drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+			break;
+		}
+		if (unlikely(!handler(mdev, header))) {
+			dev_err(DEV, "error receiving %s, l: %d!\n",
+			    cmdname(header->command), header->length);
+			drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+			break;
+		}
+
+		trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf,
+				__FILE__, __LINE__);
+	}
+}
+
+static void drbd_fail_pending_reads(struct drbd_conf *mdev)
+{
+	struct hlist_head *slot;
+	struct hlist_node *pos;
+	struct hlist_node *tmp;
+	struct drbd_request *req;
+	int i;
+
+	/*
+	 * Application READ requests
+	 */
+	spin_lock_irq(&mdev->req_lock);
+	for (i = 0; i < APP_R_HSIZE; i++) {
+		slot = mdev->app_reads_hash+i;
+		hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
+			/* it may (but should not any longer!)
+			 * be on the work queue; if that assert triggers,
+			 * we need to also grab the
+			 * spin_lock_irq(&mdev->data.work.q_lock);
+			 * and list_del_init here. */
+			D_ASSERT(list_empty(&req->w.list));
+			/* It would be nice to complete outside of spinlock.
+			 * But this is easier for now. */
+			_req_mod(req, connection_lost_while_pending);
+		}
+	}
+	for (i = 0; i < APP_R_HSIZE; i++)
+		if (!hlist_empty(mdev->app_reads_hash+i))
+			dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
+				"%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
+
+	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+void drbd_flush_workqueue(struct drbd_conf *mdev)
+{
+	struct drbd_wq_barrier barr;
+
+	barr.w.cb = w_prev_work_done;
+	init_completion(&barr.done);
+	drbd_queue_work(&mdev->data.work, &barr.w);
+	wait_for_completion(&barr.done);
+}
+
+static void drbd_disconnect(struct drbd_conf *mdev)
+{
+	enum drbd_fencing_p fp;
+	union drbd_state os, ns;
+	int rv = SS_UNKNOWN_ERROR;
+	unsigned int i;
+
+	if (mdev->state.conn == C_STANDALONE)
+		return;
+	if (mdev->state.conn >= C_WF_CONNECTION)
+		dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
+				drbd_conn_str(mdev->state.conn));
+
+	/* asender does not clean up anything. it must not interfere, either */
+	drbd_thread_stop(&mdev->asender);
+
+	mutex_lock(&mdev->data.mutex);
+	drbd_free_sock(mdev);
+	mutex_unlock(&mdev->data.mutex);
+
+	spin_lock_irq(&mdev->req_lock);
+	_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+	_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
+	_drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	/* We do not have data structures that would allow us to
+	 * get the rs_pending_cnt down to 0 again.
+	 *  * On C_SYNC_TARGET we do not have any data structures describing
+	 *    the pending RSDataRequest's we have sent.
+	 *  * On C_SYNC_SOURCE there is no data structure that tracks
+	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
+	 *  And no, it is not the sum of the reference counts in the
+	 *  resync_LRU. The resync_LRU tracks the whole operation including
+	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
+	 *  on the fly. */
+	drbd_rs_cancel_all(mdev);
+	mdev->rs_total = 0;
+	mdev->rs_failed = 0;
+	atomic_set(&mdev->rs_pending_cnt, 0);
+	wake_up(&mdev->misc_wait);
+
+	/* make sure syncer is stopped and w_resume_next_sg queued */
+	del_timer_sync(&mdev->resync_timer);
+	set_bit(STOP_SYNC_TIMER, &mdev->flags);
+	resync_timer_fn((unsigned long)mdev);
+
+	/* so we can be sure that all remote or resync reads
+	 * made it at least to net_ee */
+	wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+
+	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
+	 * w_make_resync_request etc. which may still be on the worker queue
+	 * to be "canceled" */
+	drbd_flush_workqueue(mdev);
+
+	/* This also does reclaim_net_ee().  If we do this too early, we might
+	 * miss some resync ee and pages.*/
+	drbd_process_done_ee(mdev);
+
+	kfree(mdev->p_uuid);
+	mdev->p_uuid = NULL;
+
+	if (!mdev->state.susp)
+		tl_clear(mdev);
+
+	drbd_fail_pending_reads(mdev);
+
+	dev_info(DEV, "Connection closed\n");
+
+	drbd_md_sync(mdev);
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(mdev)) {
+		fp = mdev->ldev->dc.fencing;
+		put_ldev(mdev);
+	}
+
+	if (mdev->state.role == R_PRIMARY) {
+		if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
+			enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
+			drbd_request_state(mdev, NS(pdsk, nps));
+		}
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+	if (os.conn >= C_UNCONNECTED) {
+		/* Do not restart in case we are C_DISCONNECTING */
+		ns = os;
+		ns.conn = C_UNCONNECTED;
+		rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (os.conn == C_DISCONNECTING) {
+		struct hlist_head *h;
+		wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
+
+		/* we must not free the tl_hash
+		 * while application io is still on the fly */
+		wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
+
+		spin_lock_irq(&mdev->req_lock);
+		/* paranoia code */
+		for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
+			if (h->first)
+				dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
+						(int)(h - mdev->ee_hash), h->first);
+		kfree(mdev->ee_hash);
+		mdev->ee_hash = NULL;
+		mdev->ee_hash_s = 0;
+
+		/* paranoia code */
+		for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
+			if (h->first)
+				dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
+						(int)(h - mdev->tl_hash), h->first);
+		kfree(mdev->tl_hash);
+		mdev->tl_hash = NULL;
+		mdev->tl_hash_s = 0;
+		spin_unlock_irq(&mdev->req_lock);
+
+		crypto_free_hash(mdev->cram_hmac_tfm);
+		mdev->cram_hmac_tfm = NULL;
+
+		kfree(mdev->net_conf);
+		mdev->net_conf = NULL;
+		drbd_request_state(mdev, NS(conn, C_STANDALONE));
+	}
+
+	/* tcp_close and release of sendpage pages can be deferred.  I don't
+	 * want to use SO_LINGER, because apparently it can be deferred for
+	 * more than 20 seconds (longest time I checked).
+	 *
+	 * Actually we don't care for exactly when the network stack does its
+	 * put_page(), but release our reference on these pages right here.
+	 */
+	i = drbd_release_ee(mdev, &mdev->net_ee);
+	if (i)
+		dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
+	i = atomic_read(&mdev->pp_in_use);
+	if (i)
+		dev_info(DEV, "pp_in_use = %u, expected 0\n", i);
+
+	D_ASSERT(list_empty(&mdev->read_ee));
+	D_ASSERT(list_empty(&mdev->active_ee));
+	D_ASSERT(list_empty(&mdev->sync_ee));
+	D_ASSERT(list_empty(&mdev->done_ee));
+
+	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+	atomic_set(&mdev->current_epoch->epoch_size, 0);
+	D_ASSERT(list_empty(&mdev->current_epoch->list));
+}
+
+/*
+ * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
+ * we can agree on is stored in agreed_pro_version.
+ *
+ * feature flags and the reserved array should be enough room for future
+ * enhancements of the handshake protocol, and possible plugins...
+ *
+ * for now, they are expected to be zero, but ignored.
+ */
+static int drbd_send_handshake(struct drbd_conf *mdev)
+{
+	/* ASSERT current == mdev->receiver ... */
+	struct p_handshake *p = &mdev->data.sbuf.handshake;
+	int ok;
+
+	if (mutex_lock_interruptible(&mdev->data.mutex)) {
+		dev_err(DEV, "interrupted during initial handshake\n");
+		return 0; /* interrupted. not ok. */
+	}
+
+	if (mdev->data.socket == NULL) {
+		mutex_unlock(&mdev->data.mutex);
+		return 0;
+	}
+
+	memset(p, 0, sizeof(*p));
+	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
+	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+	ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
+			     (struct p_header *)p, sizeof(*p), 0 );
+	mutex_unlock(&mdev->data.mutex);
+	return ok;
+}
+
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ */
+static int drbd_do_handshake(struct drbd_conf *mdev)
+{
+	/* ASSERT current == mdev->receiver ... */
+	struct p_handshake *p = &mdev->data.rbuf.handshake;
+	const int expect = sizeof(struct p_handshake)
+			  -sizeof(struct p_header);
+	int rv;
+
+	rv = drbd_send_handshake(mdev);
+	if (!rv)
+		return 0;
+
+	rv = drbd_recv_header(mdev, &p->head);
+	if (!rv)
+		return 0;
+
+	if (p->head.command != P_HAND_SHAKE) {
+		dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
+		     cmdname(p->head.command), p->head.command);
+		return -1;
+	}
+
+	if (p->head.length != expect) {
+		dev_err(DEV, "expected HandShake length: %u, received: %u\n",
+		     expect, p->head.length);
+		return -1;
+	}
+
+	rv = drbd_recv(mdev, &p->head.payload, expect);
+
+	if (rv != expect) {
+		dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
+		return 0;
+	}
+
+	trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf,
+			__FILE__, __LINE__);
+
+	p->protocol_min = be32_to_cpu(p->protocol_min);
+	p->protocol_max = be32_to_cpu(p->protocol_max);
+	if (p->protocol_max == 0)
+		p->protocol_max = p->protocol_min;
+
+	if (PRO_VERSION_MAX < p->protocol_min ||
+	    PRO_VERSION_MIN > p->protocol_max)
+		goto incompat;
+
+	mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+
+	dev_info(DEV, "Handshake successful: "
+	     "Agreed network protocol version %d\n", mdev->agreed_pro_version);
+
+	return 1;
+
+ incompat:
+	dev_err(DEV, "incompatible DRBD dialects: "
+	    "I support %d-%d, peer supports %d-%d\n",
+	    PRO_VERSION_MIN, PRO_VERSION_MAX,
+	    p->protocol_min, p->protocol_max);
+	return -1;
+}
+
+#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
+static int drbd_do_auth(struct drbd_conf *mdev)
+{
+	dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
+	dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+	return 0;
+}
+#else
+#define CHALLENGE_LEN 64
+static int drbd_do_auth(struct drbd_conf *mdev)
+{
+	char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
+	struct scatterlist sg;
+	char *response = NULL;
+	char *right_response = NULL;
+	char *peers_ch = NULL;
+	struct p_header p;
+	unsigned int key_len = strlen(mdev->net_conf->shared_secret);
+	unsigned int resp_size;
+	struct hash_desc desc;
+	int rv;
+
+	desc.tfm = mdev->cram_hmac_tfm;
+	desc.flags = 0;
+
+	rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
+				(u8 *)mdev->net_conf->shared_secret, key_len);
+	if (rv) {
+		dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	get_random_bytes(my_challenge, CHALLENGE_LEN);
+
+	rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
+	if (!rv)
+		goto fail;
+
+	rv = drbd_recv_header(mdev, &p);
+	if (!rv)
+		goto fail;
+
+	if (p.command != P_AUTH_CHALLENGE) {
+		dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+		    cmdname(p.command), p.command);
+		rv = 0;
+		goto fail;
+	}
+
+	if (p.length > CHALLENGE_LEN*2) {
+		dev_err(DEV, "expected AuthChallenge payload too big.\n");
+		rv = 0;
+		goto fail;
+	}
+
+	peers_ch = kmalloc(p.length, GFP_NOIO);
+	if (peers_ch == NULL) {
+		dev_err(DEV, "kmalloc of peers_ch failed\n");
+		rv = 0;
+		goto fail;
+	}
+
+	rv = drbd_recv(mdev, peers_ch, p.length);
+
+	if (rv != p.length) {
+		dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
+	response = kmalloc(resp_size, GFP_NOIO);
+	if (response == NULL) {
+		dev_err(DEV, "kmalloc of response failed\n");
+		rv = 0;
+		goto fail;
+	}
+
+	sg_init_table(&sg, 1);
+	sg_set_buf(&sg, peers_ch, p.length);
+
+	rv = crypto_hash_digest(&desc, &sg, sg.length, response);
+	if (rv) {
+		dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
+	if (!rv)
+		goto fail;
+
+	rv = drbd_recv_header(mdev, &p);
+	if (!rv)
+		goto fail;
+
+	if (p.command != P_AUTH_RESPONSE) {
+		dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
+		    cmdname(p.command), p.command);
+		rv = 0;
+		goto fail;
+	}
+
+	if (p.length != resp_size) {
+		dev_err(DEV, "expected AuthResponse payload of wrong size\n");
+		rv = 0;
+		goto fail;
+	}
+
+	rv = drbd_recv(mdev, response , resp_size);
+
+	if (rv != resp_size) {
+		dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	right_response = kmalloc(resp_size, GFP_NOIO);
+	if (response == NULL) {
+		dev_err(DEV, "kmalloc of right_response failed\n");
+		rv = 0;
+		goto fail;
+	}
+
+	sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
+
+	rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
+	if (rv) {
+		dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+		rv = 0;
+		goto fail;
+	}
+
+	rv = !memcmp(response, right_response, resp_size);
+
+	if (rv)
+		dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
+		     resp_size, mdev->net_conf->cram_hmac_alg);
+
+ fail:
+	kfree(peers_ch);
+	kfree(response);
+	kfree(right_response);
+
+	return rv;
+}
+#endif
+
+int drbdd_init(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	unsigned int minor = mdev_to_minor(mdev);
+	int h;
+
+	sprintf(current->comm, "drbd%d_receiver", minor);
+
+	dev_info(DEV, "receiver (re)started\n");
+
+	do {
+		h = drbd_connect(mdev);
+		if (h == 0) {
+			drbd_disconnect(mdev);
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ);
+		}
+		if (h == -1) {
+			dev_warn(DEV, "Discarding network configuration.\n");
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+		}
+	} while (h == 0);
+
+	if (h > 0) {
+		if (get_net_conf(mdev)) {
+			drbdd(mdev);
+			put_net_conf(mdev);
+		}
+	}
+
+	drbd_disconnect(mdev);
+
+	dev_info(DEV, "receiver terminated\n");
+	return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_req_state_reply *p = (struct p_req_state_reply *)h;
+
+	int retcode = be32_to_cpu(p->retcode);
+
+	if (retcode >= SS_SUCCESS) {
+		set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
+	} else {
+		set_bit(CL_ST_CHG_FAIL, &mdev->flags);
+		dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
+		    drbd_set_st_err_str(retcode), retcode);
+	}
+	wake_up(&mdev->state_wait);
+
+	return TRUE;
+}
+
+static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
+{
+	return drbd_send_ping_ack(mdev);
+
+}
+
+static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
+{
+	/* restore idle timeout */
+	mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+
+	return TRUE;
+}
+
+static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+	int blksize = be32_to_cpu(p->blksize);
+
+	D_ASSERT(mdev->agreed_pro_version >= 89);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	drbd_rs_complete_io(mdev, sector);
+	drbd_set_in_sync(mdev, sector, blksize);
+	/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+	mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+	dec_rs_pending(mdev);
+
+	return TRUE;
+}
+
+/* when we receive the ACK for a write request,
+ * verify that we actually know about it */
+static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
+	u64 id, sector_t sector)
+{
+	struct hlist_head *slot = tl_hash_slot(mdev, sector);
+	struct hlist_node *n;
+	struct drbd_request *req;
+
+	hlist_for_each_entry(req, n, slot, colision) {
+		if ((unsigned long)req == (unsigned long)id) {
+			if (req->sector != sector) {
+				dev_err(DEV, "_ack_id_to_req: found req %p but it has "
+				    "wrong sector (%llus versus %llus)\n", req,
+				    (unsigned long long)req->sector,
+				    (unsigned long long)sector);
+				break;
+			}
+			return req;
+		}
+	}
+	dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
+		(void *)(unsigned long)id, (unsigned long long)sector);
+	return NULL;
+}
+
+typedef struct drbd_request *(req_validator_fn)
+	(struct drbd_conf *mdev, u64 id, sector_t sector);
+
+static int validate_req_change_req_state(struct drbd_conf *mdev,
+	u64 id, sector_t sector, req_validator_fn validator,
+	const char *func, enum drbd_req_event what)
+{
+	struct drbd_request *req;
+	struct bio_and_error m;
+
+	spin_lock_irq(&mdev->req_lock);
+	req = validator(mdev, id, sector);
+	if (unlikely(!req)) {
+		spin_unlock_irq(&mdev->req_lock);
+		dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
+		return FALSE;
+	}
+	__req_mod(req, what, &m);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+	return TRUE;
+}
+
+static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+	int blksize = be32_to_cpu(p->blksize);
+	enum drbd_req_event what;
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (is_syncer_block_id(p->block_id)) {
+		drbd_set_in_sync(mdev, sector, blksize);
+		dec_rs_pending(mdev);
+		return TRUE;
+	}
+	switch (be16_to_cpu(h->command)) {
+	case P_RS_WRITE_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		what = write_acked_by_peer_and_sis;
+		break;
+	case P_WRITE_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		what = write_acked_by_peer;
+		break;
+	case P_RECV_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
+		what = recv_acked_by_peer;
+		break;
+	case P_DISCARD_ACK:
+		D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
+		what = conflict_discarded_by_peer;
+		break;
+	default:
+		D_ASSERT(0);
+		return FALSE;
+	}
+
+	return validate_req_change_req_state(mdev, p->block_id, sector,
+		_ack_id_to_req, __func__ , what);
+}
+
+static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+
+	if (__ratelimit(&drbd_ratelimit_state))
+		dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (is_syncer_block_id(p->block_id)) {
+		int size = be32_to_cpu(p->blksize);
+		dec_rs_pending(mdev);
+		drbd_rs_failed_io(mdev, sector, size);
+		return TRUE;
+	}
+	return validate_req_change_req_state(mdev, p->block_id, sector,
+		_ack_id_to_req, __func__ , neg_acked);
+}
+
+static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	sector_t sector = be64_to_cpu(p->sector);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+	dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
+	    (unsigned long long)sector, be32_to_cpu(p->blksize));
+
+	return validate_req_change_req_state(mdev, p->block_id, sector,
+		_ar_id_to_req, __func__ , neg_acked);
+}
+
+static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
+{
+	sector_t sector;
+	int size;
+	struct p_block_ack *p = (struct p_block_ack *)h;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+	D_ASSERT(p->block_id == ID_SYNCER);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	dec_rs_pending(mdev);
+
+	if (get_ldev_if_state(mdev, D_FAILED)) {
+		drbd_rs_complete_io(mdev, sector);
+		drbd_rs_failed_io(mdev, sector, size);
+		put_ldev(mdev);
+	}
+
+	return TRUE;
+}
+
+static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_barrier_ack *p = (struct p_barrier_ack *)h;
+
+	tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
+
+	return TRUE;
+}
+
+static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_block_ack *p = (struct p_block_ack *)h;
+	struct drbd_work *w;
+	sector_t sector;
+	int size;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
+
+	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
+		drbd_ov_oos_found(mdev, sector, size);
+	else
+		ov_oos_print(mdev);
+
+	drbd_rs_complete_io(mdev, sector);
+	dec_rs_pending(mdev);
+
+	if (--mdev->ov_left == 0) {
+		w = kmalloc(sizeof(*w), GFP_NOIO);
+		if (w) {
+			w->cb = w_ov_finished;
+			drbd_queue_work_front(&mdev->data.work, w);
+		} else {
+			dev_err(DEV, "kmalloc(w) failed.");
+			ov_oos_print(mdev);
+			drbd_resync_finished(mdev);
+		}
+	}
+	return TRUE;
+}
+
+struct asender_cmd {
+	size_t pkt_size;
+	int (*process)(struct drbd_conf *mdev, struct p_header *h);
+};
+
+static struct asender_cmd *get_asender_cmd(int cmd)
+{
+	static struct asender_cmd asender_tbl[] = {
+		/* anything missing from this table is in
+		 * the drbd_cmd_handler (drbd_default_handler) table,
+		 * see the beginning of drbdd() */
+	[P_PING]	    = { sizeof(struct p_header), got_Ping },
+	[P_PING_ACK]	    = { sizeof(struct p_header), got_PingAck },
+	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_WRITE_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_DISCARD_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_NEG_ACK]	    = { sizeof(struct p_block_ack), got_NegAck },
+	[P_NEG_DREPLY]	    = { sizeof(struct p_block_ack), got_NegDReply },
+	[P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply},
+	[P_OV_RESULT]	    = { sizeof(struct p_block_ack), got_OVResult },
+	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
+	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
+	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
+	[P_MAX_CMD]	    = { 0, NULL },
+	};
+	if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
+		return NULL;
+	return &asender_tbl[cmd];
+}
+
+int drbd_asender(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	struct p_header *h = &mdev->meta.rbuf.header;
+	struct asender_cmd *cmd = NULL;
+
+	int rv, len;
+	void *buf    = h;
+	int received = 0;
+	int expect   = sizeof(struct p_header);
+	int empty;
+
+	sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
+
+	current->policy = SCHED_RR;  /* Make this a realtime task! */
+	current->rt_priority = 2;    /* more important than all other tasks */
+
+	while (get_t_state(thi) == Running) {
+		drbd_thread_current_set_cpu(mdev);
+		if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
+			ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
+			mdev->meta.socket->sk->sk_rcvtimeo =
+				mdev->net_conf->ping_timeo*HZ/10;
+		}
+
+		/* conditionally cork;
+		 * it may hurt latency if we cork without much to send */
+		if (!mdev->net_conf->no_cork &&
+			3 < atomic_read(&mdev->unacked_cnt))
+			drbd_tcp_cork(mdev->meta.socket);
+		while (1) {
+			clear_bit(SIGNAL_ASENDER, &mdev->flags);
+			flush_signals(current);
+			if (!drbd_process_done_ee(mdev)) {
+				dev_err(DEV, "process_done_ee() = NOT_OK\n");
+				goto reconnect;
+			}
+			/* to avoid race with newly queued ACKs */
+			set_bit(SIGNAL_ASENDER, &mdev->flags);
+			spin_lock_irq(&mdev->req_lock);
+			empty = list_empty(&mdev->done_ee);
+			spin_unlock_irq(&mdev->req_lock);
+			/* new ack may have been queued right here,
+			 * but then there is also a signal pending,
+			 * and we start over... */
+			if (empty)
+				break;
+		}
+		/* but unconditionally uncork unless disabled */
+		if (!mdev->net_conf->no_cork)
+			drbd_tcp_uncork(mdev->meta.socket);
+
+		/* short circuit, recv_msg would return EINTR anyways. */
+		if (signal_pending(current))
+			continue;
+
+		rv = drbd_recv_short(mdev, mdev->meta.socket,
+				     buf, expect-received, 0);
+		clear_bit(SIGNAL_ASENDER, &mdev->flags);
+
+		flush_signals(current);
+
+		/* Note:
+		 * -EINTR	 (on meta) we got a signal
+		 * -EAGAIN	 (on meta) rcvtimeo expired
+		 * -ECONNRESET	 other side closed the connection
+		 * -ERESTARTSYS  (on data) we got a signal
+		 * rv <  0	 other than above: unexpected error!
+		 * rv == expected: full header or command
+		 * rv <  expected: "woken" by signal during receive
+		 * rv == 0	 : "connection shut down by peer"
+		 */
+		if (likely(rv > 0)) {
+			received += rv;
+			buf	 += rv;
+		} else if (rv == 0) {
+			dev_err(DEV, "meta connection shut down by peer.\n");
+			goto reconnect;
+		} else if (rv == -EAGAIN) {
+			if (mdev->meta.socket->sk->sk_rcvtimeo ==
+			    mdev->net_conf->ping_timeo*HZ/10) {
+				dev_err(DEV, "PingAck did not arrive in time.\n");
+				goto reconnect;
+			}
+			set_bit(SEND_PING, &mdev->flags);
+			continue;
+		} else if (rv == -EINTR) {
+			continue;
+		} else {
+			dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+			goto reconnect;
+		}
+
+		if (received == expect && cmd == NULL) {
+			if (unlikely(h->magic != BE_DRBD_MAGIC)) {
+				dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
+				    (long)be32_to_cpu(h->magic),
+				    h->command, h->length);
+				goto reconnect;
+			}
+			cmd = get_asender_cmd(be16_to_cpu(h->command));
+			len = be16_to_cpu(h->length);
+			if (unlikely(cmd == NULL)) {
+				dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
+				    (long)be32_to_cpu(h->magic),
+				    h->command, h->length);
+				goto disconnect;
+			}
+			expect = cmd->pkt_size;
+			ERR_IF(len != expect-sizeof(struct p_header)) {
+				trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__);
+				goto reconnect;
+			}
+		}
+		if (received == expect) {
+			D_ASSERT(cmd != NULL);
+			trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__);
+			if (!cmd->process(mdev, h))
+				goto reconnect;
+
+			buf	 = h;
+			received = 0;
+			expect	 = sizeof(struct p_header);
+			cmd	 = NULL;
+		}
+	}
+
+	if (0) {
+reconnect:
+		drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+	}
+	if (0) {
+disconnect:
+		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+	}
+	clear_bit(SIGNAL_ASENDER, &mdev->flags);
+
+	D_ASSERT(mdev->state.conn < C_CONNECTED);
+	dev_info(DEV, "asender terminated\n");
+
+	return 0;
+}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 000000000000..0656cf1edd57
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1132 @@
+/*
+   drbd_req.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include "drbd_req.h"
+
+
+/* Update disk stats at start of I/O request */
+static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
+{
+	const int rw = bio_data_dir(bio);
+	int cpu;
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
+	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+	part_stat_unlock();
+	mdev->vdisk->part0.in_flight[rw]++;
+}
+
+/* Update disk stats when completing request upwards */
+static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
+{
+	int rw = bio_data_dir(req->master_bio);
+	unsigned long duration = jiffies - req->start_time;
+	int cpu;
+	cpu = part_stat_lock();
+	part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
+	part_round_stats(cpu, &mdev->vdisk->part0);
+	part_stat_unlock();
+	mdev->vdisk->part0.in_flight[rw]--;
+}
+
+static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
+{
+	const unsigned long s = req->rq_state;
+	/* if it was a write, we may have to set the corresponding
+	 * bit(s) out-of-sync first. If it had a local part, we need to
+	 * release the reference to the activity log. */
+	if (rw == WRITE) {
+		/* remove it from the transfer log.
+		 * well, only if it had been there in the first
+		 * place... if it had not (local only or conflicting
+		 * and never sent), it should still be "empty" as
+		 * initialized in drbd_req_new(), so we can list_del() it
+		 * here unconditionally */
+		list_del(&req->tl_requests);
+		/* Set out-of-sync unless both OK flags are set
+		 * (local only or remote failed).
+		 * Other places where we set out-of-sync:
+		 * READ with local io-error */
+		if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+			drbd_set_out_of_sync(mdev, req->sector, req->size);
+
+		if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+			drbd_set_in_sync(mdev, req->sector, req->size);
+
+		/* one might be tempted to move the drbd_al_complete_io
+		 * to the local io completion callback drbd_endio_pri.
+		 * but, if this was a mirror write, we may only
+		 * drbd_al_complete_io after this is RQ_NET_DONE,
+		 * otherwise the extent could be dropped from the al
+		 * before it has actually been written on the peer.
+		 * if we crash before our peer knows about the request,
+		 * but after the extent has been dropped from the al,
+		 * we would forget to resync the corresponding extent.
+		 */
+		if (s & RQ_LOCAL_MASK) {
+			if (get_ldev_if_state(mdev, D_FAILED)) {
+				drbd_al_complete_io(mdev, req->sector);
+				put_ldev(mdev);
+			} else if (__ratelimit(&drbd_ratelimit_state)) {
+				dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
+				     "but my Disk seems to have failed :(\n",
+				     (unsigned long long) req->sector);
+			}
+		}
+	}
+
+	/* if it was a local io error, we want to notify our
+	 * peer about that, and see if we need to
+	 * detach the disk and stuff.
+	 * to avoid allocating some special work
+	 * struct, reuse the request. */
+
+	/* THINK
+	 * why do we do this not when we detect the error,
+	 * but delay it until it is "done", i.e. possibly
+	 * until the next barrier ack? */
+
+	if (rw == WRITE &&
+	    ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
+		if (!(req->w.list.next == LIST_POISON1 ||
+		      list_empty(&req->w.list))) {
+			/* DEBUG ASSERT only; if this triggers, we
+			 * probably corrupt the worker list here */
+			dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
+			dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
+		}
+		req->w.cb = w_io_error;
+		drbd_queue_work(&mdev->data.work, &req->w);
+		/* drbd_req_free() is done in w_io_error */
+	} else {
+		drbd_req_free(req);
+	}
+}
+
+static void queue_barrier(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+
+	/* We are within the req_lock. Once we queued the barrier for sending,
+	 * we set the CREATE_BARRIER bit. It is cleared as soon as a new
+	 * barrier/epoch object is added. This is the only place this bit is
+	 * set. It indicates that the barrier for this epoch is already queued,
+	 * and no new epoch has been created yet. */
+	if (test_bit(CREATE_BARRIER, &mdev->flags))
+		return;
+
+	b = mdev->newest_tle;
+	b->w.cb = w_send_barrier;
+	/* inc_ap_pending done here, so we won't
+	 * get imbalanced on connection loss.
+	 * dec_ap_pending will be done in got_BarrierAck
+	 * or (on connection loss) in tl_clear.  */
+	inc_ap_pending(mdev);
+	drbd_queue_work(&mdev->data.work, &b->w);
+	set_bit(CREATE_BARRIER, &mdev->flags);
+}
+
+static void _about_to_complete_local_write(struct drbd_conf *mdev,
+	struct drbd_request *req)
+{
+	const unsigned long s = req->rq_state;
+	struct drbd_request *i;
+	struct drbd_epoch_entry *e;
+	struct hlist_node *n;
+	struct hlist_head *slot;
+
+	/* before we can signal completion to the upper layers,
+	 * we may need to close the current epoch */
+	if (mdev->state.conn >= C_CONNECTED &&
+	    req->epoch == mdev->newest_tle->br_number)
+		queue_barrier(mdev);
+
+	/* we need to do the conflict detection stuff,
+	 * if we have the ee_hash (two_primaries) and
+	 * this has been on the network */
+	if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
+		const sector_t sector = req->sector;
+		const int size = req->size;
+
+		/* ASSERT:
+		 * there must be no conflicting requests, since
+		 * they must have been failed on the spot */
+#define OVERLAPS overlaps(sector, size, i->sector, i->size)
+		slot = tl_hash_slot(mdev, sector);
+		hlist_for_each_entry(i, n, slot, colision) {
+			if (OVERLAPS) {
+				dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
+				      "other: %p %llus +%u\n",
+				      req, (unsigned long long)sector, size,
+				      i, (unsigned long long)i->sector, i->size);
+			}
+		}
+
+		/* maybe "wake" those conflicting epoch entries
+		 * that wait for this request to finish.
+		 *
+		 * currently, there can be only _one_ such ee
+		 * (well, or some more, which would be pending
+		 * P_DISCARD_ACK not yet sent by the asender...),
+		 * since we block the receiver thread upon the
+		 * first conflict detection, which will wait on
+		 * misc_wait.  maybe we want to assert that?
+		 *
+		 * anyways, if we found one,
+		 * we just have to do a wake_up.  */
+#undef OVERLAPS
+#define OVERLAPS overlaps(sector, size, e->sector, e->size)
+		slot = ee_hash_slot(mdev, req->sector);
+		hlist_for_each_entry(e, n, slot, colision) {
+			if (OVERLAPS) {
+				wake_up(&mdev->misc_wait);
+				break;
+			}
+		}
+	}
+#undef OVERLAPS
+}
+
+void complete_master_bio(struct drbd_conf *mdev,
+		struct bio_and_error *m)
+{
+	trace_drbd_bio(mdev, "Rq", m->bio, 1, NULL);
+	bio_endio(m->bio, m->error);
+	dec_ap_bio(mdev);
+}
+
+/* Helper for __req_mod().
+ * Set m->bio to the master bio, if it is fit to be completed,
+ * or leave it alone (it is initialized to NULL in __req_mod),
+ * if it has already been completed, or cannot be completed yet.
+ * If m->bio is set, the error status to be returned is placed in m->error.
+ */
+void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
+{
+	const unsigned long s = req->rq_state;
+	struct drbd_conf *mdev = req->mdev;
+	/* only WRITES may end up here without a master bio (on barrier ack) */
+	int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
+
+	trace_drbd_req(req, nothing, "_req_may_be_done");
+
+	/* we must not complete the master bio, while it is
+	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
+	 *	not yet acknowledged by the peer
+	 *	not yet completed by the local io subsystem
+	 * these flags may get cleared in any order by
+	 *	the worker,
+	 *	the receiver,
+	 *	the bio_endio completion callbacks.
+	 */
+	if (s & RQ_NET_QUEUED)
+		return;
+	if (s & RQ_NET_PENDING)
+		return;
+	if (s & RQ_LOCAL_PENDING)
+		return;
+
+	if (req->master_bio) {
+		/* this is data_received (remote read)
+		 * or protocol C P_WRITE_ACK
+		 * or protocol B P_RECV_ACK
+		 * or protocol A "handed_over_to_network" (SendAck)
+		 * or canceled or failed,
+		 * or killed from the transfer log due to connection loss.
+		 */
+
+		/*
+		 * figure out whether to report success or failure.
+		 *
+		 * report success when at least one of the operations succeeded.
+		 * or, to put the other way,
+		 * only report failure, when both operations failed.
+		 *
+		 * what to do about the failures is handled elsewhere.
+		 * what we need to do here is just: complete the master_bio.
+		 *
+		 * local completion error, if any, has been stored as ERR_PTR
+		 * in private_bio within drbd_endio_pri.
+		 */
+		int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+		int error = PTR_ERR(req->private_bio);
+
+		/* remove the request from the conflict detection
+		 * respective block_id verification hash */
+		if (!hlist_unhashed(&req->colision))
+			hlist_del(&req->colision);
+		else
+			D_ASSERT((s & RQ_NET_MASK) == 0);
+
+		/* for writes we need to do some extra housekeeping */
+		if (rw == WRITE)
+			_about_to_complete_local_write(mdev, req);
+
+		/* Update disk stats */
+		_drbd_end_io_acct(mdev, req);
+
+		m->error = ok ? 0 : (error ?: -EIO);
+		m->bio = req->master_bio;
+		req->master_bio = NULL;
+	}
+
+	if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
+		/* this is disconnected (local only) operation,
+		 * or protocol C P_WRITE_ACK,
+		 * or protocol A or B P_BARRIER_ACK,
+		 * or killed from the transfer log due to connection loss. */
+		_req_is_done(mdev, req, rw);
+	}
+	/* else: network part and not DONE yet. that is
+	 * protocol A or B, barrier ack still pending... */
+}
+
+/*
+ * checks whether there was an overlapping request
+ * or ee already registered.
+ *
+ * if so, return 1, in which case this request is completed on the spot,
+ * without ever being submitted or send.
+ *
+ * return 0 if it is ok to submit this request.
+ *
+ * NOTE:
+ * paranoia: assume something above us is broken, and issues different write
+ * requests for the same block simultaneously...
+ *
+ * To ensure these won't be reordered differently on both nodes, resulting in
+ * diverging data sets, we discard the later one(s). Not that this is supposed
+ * to happen, but this is the rationale why we also have to check for
+ * conflicting requests with local origin, and why we have to do so regardless
+ * of whether we allowed multiple primaries.
+ *
+ * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
+ * second hlist_for_each_entry becomes a noop. This is even simpler than to
+ * grab a reference on the net_conf, and check for the two_primaries flag...
+ */
+static int _req_conflicts(struct drbd_request *req)
+{
+	struct drbd_conf *mdev = req->mdev;
+	const sector_t sector = req->sector;
+	const int size = req->size;
+	struct drbd_request *i;
+	struct drbd_epoch_entry *e;
+	struct hlist_node *n;
+	struct hlist_head *slot;
+
+	D_ASSERT(hlist_unhashed(&req->colision));
+
+	if (!get_net_conf(mdev))
+		return 0;
+
+	/* BUG_ON */
+	ERR_IF (mdev->tl_hash_s == 0)
+		goto out_no_conflict;
+	BUG_ON(mdev->tl_hash == NULL);
+
+#define OVERLAPS overlaps(i->sector, i->size, sector, size)
+	slot = tl_hash_slot(mdev, sector);
+	hlist_for_each_entry(i, n, slot, colision) {
+		if (OVERLAPS) {
+			dev_alert(DEV, "%s[%u] Concurrent local write detected! "
+			      "[DISCARD L] new: %llus +%u; "
+			      "pending: %llus +%u\n",
+			      current->comm, current->pid,
+			      (unsigned long long)sector, size,
+			      (unsigned long long)i->sector, i->size);
+			goto out_conflict;
+		}
+	}
+
+	if (mdev->ee_hash_s) {
+		/* now, check for overlapping requests with remote origin */
+		BUG_ON(mdev->ee_hash == NULL);
+#undef OVERLAPS
+#define OVERLAPS overlaps(e->sector, e->size, sector, size)
+		slot = ee_hash_slot(mdev, sector);
+		hlist_for_each_entry(e, n, slot, colision) {
+			if (OVERLAPS) {
+				dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
+				      " [DISCARD L] new: %llus +%u; "
+				      "pending: %llus +%u\n",
+				      current->comm, current->pid,
+				      (unsigned long long)sector, size,
+				      (unsigned long long)e->sector, e->size);
+				goto out_conflict;
+			}
+		}
+	}
+#undef OVERLAPS
+
+out_no_conflict:
+	/* this is like it should be, and what we expected.
+	 * our users do behave after all... */
+	put_net_conf(mdev);
+	return 0;
+
+out_conflict:
+	put_net_conf(mdev);
+	return 1;
+}
+
+/* obviously this could be coded as many single functions
+ * instead of one huge switch,
+ * or by putting the code directly in the respective locations
+ * (as it has been before).
+ *
+ * but having it this way
+ *  enforces that it is all in this one place, where it is easier to audit,
+ *  it makes it obvious that whatever "event" "happens" to a request should
+ *  happen "atomically" within the req_lock,
+ *  and it enforces that we have to think in a very structured manner
+ *  about the "events" that may happen to a request during its life time ...
+ */
+void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m)
+{
+	struct drbd_conf *mdev = req->mdev;
+	m->bio = NULL;
+
+	trace_drbd_req(req, what, NULL);
+
+	switch (what) {
+	default:
+		dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
+		break;
+
+	/* does not happen...
+	 * initialization done in drbd_req_new
+	case created:
+		break;
+		*/
+
+	case to_be_send: /* via network */
+		/* reached via drbd_make_request_common
+		 * and from w_read_retry_remote */
+		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+		req->rq_state |= RQ_NET_PENDING;
+		inc_ap_pending(mdev);
+		break;
+
+	case to_be_submitted: /* locally */
+		/* reached via drbd_make_request_common */
+		D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
+		req->rq_state |= RQ_LOCAL_PENDING;
+		break;
+
+	case completed_ok:
+		if (bio_data_dir(req->master_bio) == WRITE)
+			mdev->writ_cnt += req->size>>9;
+		else
+			mdev->read_cnt += req->size>>9;
+
+		req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+
+		_req_may_be_done(req, m);
+		put_ldev(mdev);
+		break;
+
+	case write_completed_with_error:
+		req->rq_state |= RQ_LOCAL_COMPLETED;
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+
+		dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
+		      (unsigned long long)req->sector, req->size);
+		/* and now: check how to handle local io error. */
+		__drbd_chk_io_error(mdev, FALSE);
+		_req_may_be_done(req, m);
+		put_ldev(mdev);
+		break;
+
+	case read_ahead_completed_with_error:
+		/* it is legal to fail READA */
+		req->rq_state |= RQ_LOCAL_COMPLETED;
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+		_req_may_be_done(req, m);
+		put_ldev(mdev);
+		break;
+
+	case read_completed_with_error:
+		drbd_set_out_of_sync(mdev, req->sector, req->size);
+
+		req->rq_state |= RQ_LOCAL_COMPLETED;
+		req->rq_state &= ~RQ_LOCAL_PENDING;
+
+		dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
+		      (unsigned long long)req->sector, req->size);
+		/* _req_mod(req,to_be_send); oops, recursion... */
+		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
+		req->rq_state |= RQ_NET_PENDING;
+		inc_ap_pending(mdev);
+
+		__drbd_chk_io_error(mdev, FALSE);
+		put_ldev(mdev);
+		/* NOTE: if we have no connection,
+		 * or know the peer has no good data either,
+		 * then we don't actually need to "queue_for_net_read",
+		 * but we do so anyways, since the drbd_io_error()
+		 * and the potential state change to "Diskless"
+		 * needs to be done from process context */
+
+		/* fall through: _req_mod(req,queue_for_net_read); */
+
+	case queue_for_net_read:
+		/* READ or READA, and
+		 * no local disk,
+		 * or target area marked as invalid,
+		 * or just got an io-error. */
+		/* from drbd_make_request_common
+		 * or from bio_endio during read io-error recovery */
+
+		/* so we can verify the handle in the answer packet
+		 * corresponding hlist_del is in _req_may_be_done() */
+		hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector));
+
+		set_bit(UNPLUG_REMOTE, &mdev->flags); /* why? */
+
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		req->rq_state |= RQ_NET_QUEUED;
+		req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
+			? w_read_retry_remote
+			: w_send_read_req;
+		drbd_queue_work(&mdev->data.work, &req->w);
+		break;
+
+	case queue_for_net_write:
+		/* assert something? */
+		/* from drbd_make_request_common only */
+
+		hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector));
+		/* corresponding hlist_del is in _req_may_be_done() */
+
+		/* NOTE
+		 * In case the req ended up on the transfer log before being
+		 * queued on the worker, it could lead to this request being
+		 * missed during cleanup after connection loss.
+		 * So we have to do both operations here,
+		 * within the same lock that protects the transfer log.
+		 *
+		 * _req_add_to_epoch(req); this has to be after the
+		 * _maybe_start_new_epoch(req); which happened in
+		 * drbd_make_request_common, because we now may set the bit
+		 * again ourselves to close the current epoch.
+		 *
+		 * Add req to the (now) current epoch (barrier). */
+
+		/* see drbd_make_request_common,
+		 * just after it grabs the req_lock */
+		D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
+
+		req->epoch = mdev->newest_tle->br_number;
+		list_add_tail(&req->tl_requests,
+				&mdev->newest_tle->requests);
+
+		/* increment size of current epoch */
+		mdev->newest_tle->n_req++;
+
+		/* queue work item to send data */
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		req->rq_state |= RQ_NET_QUEUED;
+		req->w.cb =  w_send_dblock;
+		drbd_queue_work(&mdev->data.work, &req->w);
+
+		/* close the epoch, in case it outgrew the limit */
+		if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
+			queue_barrier(mdev);
+
+		break;
+
+	case send_canceled:
+		/* treat it the same */
+	case send_failed:
+		/* real cleanup will be done from tl_clear.  just update flags
+		 * so it is no longer marked as on the worker queue */
+		req->rq_state &= ~RQ_NET_QUEUED;
+		/* if we did it right, tl_clear should be scheduled only after
+		 * this, so this should not be necessary! */
+		_req_may_be_done(req, m);
+		break;
+
+	case handed_over_to_network:
+		/* assert something? */
+		if (bio_data_dir(req->master_bio) == WRITE &&
+		    mdev->net_conf->wire_protocol == DRBD_PROT_A) {
+			/* this is what is dangerous about protocol A:
+			 * pretend it was successfully written on the peer. */
+			if (req->rq_state & RQ_NET_PENDING) {
+				dec_ap_pending(mdev);
+				req->rq_state &= ~RQ_NET_PENDING;
+				req->rq_state |= RQ_NET_OK;
+			} /* else: neg-ack was faster... */
+			/* it is still not yet RQ_NET_DONE until the
+			 * corresponding epoch barrier got acked as well,
+			 * so we know what to dirty on connection loss */
+		}
+		req->rq_state &= ~RQ_NET_QUEUED;
+		req->rq_state |= RQ_NET_SENT;
+		/* because _drbd_send_zc_bio could sleep, and may want to
+		 * dereference the bio even after the "write_acked_by_peer" and
+		 * "completed_ok" events came in, once we return from
+		 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
+		 * whether it is done already, and end it.  */
+		_req_may_be_done(req, m);
+		break;
+
+	case connection_lost_while_pending:
+		/* transfer log cleanup after connection loss */
+		/* assert something? */
+		if (req->rq_state & RQ_NET_PENDING)
+			dec_ap_pending(mdev);
+		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+		req->rq_state |= RQ_NET_DONE;
+		/* if it is still queued, we may not complete it here.
+		 * it will be canceled soon. */
+		if (!(req->rq_state & RQ_NET_QUEUED))
+			_req_may_be_done(req, m);
+		break;
+
+	case write_acked_by_peer_and_sis:
+		req->rq_state |= RQ_NET_SIS;
+	case conflict_discarded_by_peer:
+		/* for discarded conflicting writes of multiple primaries,
+		 * there is no need to keep anything in the tl, potential
+		 * node crashes are covered by the activity log. */
+		if (what == conflict_discarded_by_peer)
+			dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
+			      " DRBD is not a random data generator!\n",
+			      (unsigned long long)req->sector, req->size);
+		req->rq_state |= RQ_NET_DONE;
+		/* fall through */
+	case write_acked_by_peer:
+		/* protocol C; successfully written on peer.
+		 * Nothing to do here.
+		 * We want to keep the tl in place for all protocols, to cater
+		 * for volatile write-back caches on lower level devices.
+		 *
+		 * A barrier request is expected to have forced all prior
+		 * requests onto stable storage, so completion of a barrier
+		 * request could set NET_DONE right here, and not wait for the
+		 * P_BARRIER_ACK, but that is an unnecessary optimization. */
+
+		/* this makes it effectively the same as for: */
+	case recv_acked_by_peer:
+		/* protocol B; pretends to be successfully written on peer.
+		 * see also notes above in handed_over_to_network about
+		 * protocol != C */
+		req->rq_state |= RQ_NET_OK;
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		dec_ap_pending(mdev);
+		req->rq_state &= ~RQ_NET_PENDING;
+		_req_may_be_done(req, m);
+		break;
+
+	case neg_acked:
+		/* assert something? */
+		if (req->rq_state & RQ_NET_PENDING)
+			dec_ap_pending(mdev);
+		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
+
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done(req, m);
+		/* else: done by handed_over_to_network */
+		break;
+
+	case barrier_acked:
+		if (req->rq_state & RQ_NET_PENDING) {
+			/* barrier came in before all requests have been acked.
+			 * this is bad, because if the connection is lost now,
+			 * we won't be able to clean them up... */
+			dev_err(DEV, "FIXME (barrier_acked but pending)\n");
+			trace_drbd_req(req, nothing, "FIXME (barrier_acked but pending)");
+			list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
+		}
+		D_ASSERT(req->rq_state & RQ_NET_SENT);
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done(req, m);
+		break;
+
+	case data_received:
+		D_ASSERT(req->rq_state & RQ_NET_PENDING);
+		dec_ap_pending(mdev);
+		req->rq_state &= ~RQ_NET_PENDING;
+		req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
+		_req_may_be_done(req, m);
+		break;
+	};
+}
+
+/* we may do a local read if:
+ * - we are consistent (of course),
+ * - or we are generally inconsistent,
+ *   BUT we are still/already IN SYNC for this area.
+ *   since size may be bigger than BM_BLOCK_SIZE,
+ *   we may need to check several bits.
+ */
+static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	unsigned long sbnr, ebnr;
+	sector_t esector, nr_sectors;
+
+	if (mdev->state.disk == D_UP_TO_DATE)
+		return 1;
+	if (mdev->state.disk >= D_OUTDATED)
+		return 0;
+	if (mdev->state.disk <  D_INCONSISTENT)
+		return 0;
+	/* state.disk == D_INCONSISTENT   We will have a look at the BitMap */
+	nr_sectors = drbd_get_capacity(mdev->this_bdev);
+	esector = sector + (size >> 9) - 1;
+
+	D_ASSERT(sector  < nr_sectors);
+	D_ASSERT(esector < nr_sectors);
+
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
+}
+
+static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
+{
+	const int rw = bio_rw(bio);
+	const int size = bio->bi_size;
+	const sector_t sector = bio->bi_sector;
+	struct drbd_tl_epoch *b = NULL;
+	struct drbd_request *req;
+	int local, remote;
+	int err = -EIO;
+
+	/* allocate outside of all locks; */
+	req = drbd_req_new(mdev, bio);
+	if (!req) {
+		dec_ap_bio(mdev);
+		/* only pass the error to the upper layers.
+		 * if user cannot handle io errors, that's not our business. */
+		dev_err(DEV, "could not kmalloc() req\n");
+		bio_endio(bio, -ENOMEM);
+		return 0;
+	}
+
+	trace_drbd_bio(mdev, "Rq", bio, 0, req);
+
+	local = get_ldev(mdev);
+	if (!local) {
+		bio_put(req->private_bio); /* or we get a bio leak */
+		req->private_bio = NULL;
+	}
+	if (rw == WRITE) {
+		remote = 1;
+	} else {
+		/* READ || READA */
+		if (local) {
+			if (!drbd_may_do_local_read(mdev, sector, size)) {
+				/* we could kick the syncer to
+				 * sync this extent asap, wait for
+				 * it, then continue locally.
+				 * Or just issue the request remotely.
+				 */
+				local = 0;
+				bio_put(req->private_bio);
+				req->private_bio = NULL;
+				put_ldev(mdev);
+			}
+		}
+		remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
+	}
+
+	/* If we have a disk, but a READA request is mapped to remote,
+	 * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
+	 * Just fail that READA request right here.
+	 *
+	 * THINK: maybe fail all READA when not local?
+	 *        or make this configurable...
+	 *        if network is slow, READA won't do any good.
+	 */
+	if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
+		err = -EWOULDBLOCK;
+		goto fail_and_free_req;
+	}
+
+	/* For WRITES going to the local disk, grab a reference on the target
+	 * extent.  This waits for any resync activity in the corresponding
+	 * resync extent to finish, and, if necessary, pulls in the target
+	 * extent into the activity log, which involves further disk io because
+	 * of transactional on-disk meta data updates. */
+	if (rw == WRITE && local)
+		drbd_al_begin_io(mdev, sector);
+
+	remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
+			    (mdev->state.pdsk == D_INCONSISTENT &&
+			     mdev->state.conn >= C_CONNECTED));
+
+	if (!(local || remote)) {
+		dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+		goto fail_free_complete;
+	}
+
+	/* For WRITE request, we have to make sure that we have an
+	 * unused_spare_tle, in case we need to start a new epoch.
+	 * I try to be smart and avoid to pre-allocate always "just in case",
+	 * but there is a race between testing the bit and pointer outside the
+	 * spinlock, and grabbing the spinlock.
+	 * if we lost that race, we retry.  */
+	if (rw == WRITE && remote &&
+	    mdev->unused_spare_tle == NULL &&
+	    test_bit(CREATE_BARRIER, &mdev->flags)) {
+allocate_barrier:
+		b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
+		if (!b) {
+			dev_err(DEV, "Failed to alloc barrier.\n");
+			err = -ENOMEM;
+			goto fail_free_complete;
+		}
+	}
+
+	/* GOOD, everything prepared, grab the spin_lock */
+	spin_lock_irq(&mdev->req_lock);
+
+	if (remote) {
+		remote = (mdev->state.pdsk == D_UP_TO_DATE ||
+			    (mdev->state.pdsk == D_INCONSISTENT &&
+			     mdev->state.conn >= C_CONNECTED));
+		if (!remote)
+			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
+		if (!(local || remote)) {
+			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
+			spin_unlock_irq(&mdev->req_lock);
+			goto fail_free_complete;
+		}
+	}
+
+	if (b && mdev->unused_spare_tle == NULL) {
+		mdev->unused_spare_tle = b;
+		b = NULL;
+	}
+	if (rw == WRITE && remote &&
+	    mdev->unused_spare_tle == NULL &&
+	    test_bit(CREATE_BARRIER, &mdev->flags)) {
+		/* someone closed the current epoch
+		 * while we were grabbing the spinlock */
+		spin_unlock_irq(&mdev->req_lock);
+		goto allocate_barrier;
+	}
+
+
+	/* Update disk stats */
+	_drbd_start_io_acct(mdev, req, bio);
+
+	/* _maybe_start_new_epoch(mdev);
+	 * If we need to generate a write barrier packet, we have to add the
+	 * new epoch (barrier) object, and queue the barrier packet for sending,
+	 * and queue the req's data after it _within the same lock_, otherwise
+	 * we have race conditions were the reorder domains could be mixed up.
+	 *
+	 * Even read requests may start a new epoch and queue the corresponding
+	 * barrier packet.  To get the write ordering right, we only have to
+	 * make sure that, if this is a write request and it triggered a
+	 * barrier packet, this request is queued within the same spinlock. */
+	if (remote && mdev->unused_spare_tle &&
+	    test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
+		_tl_add_barrier(mdev, mdev->unused_spare_tle);
+		mdev->unused_spare_tle = NULL;
+	} else {
+		D_ASSERT(!(remote && rw == WRITE &&
+			   test_bit(CREATE_BARRIER, &mdev->flags)));
+	}
+
+	/* NOTE
+	 * Actually, 'local' may be wrong here already, since we may have failed
+	 * to write to the meta data, and may become wrong anytime because of
+	 * local io-error for some other request, which would lead to us
+	 * "detaching" the local disk.
+	 *
+	 * 'remote' may become wrong any time because the network could fail.
+	 *
+	 * This is a harmless race condition, though, since it is handled
+	 * correctly at the appropriate places; so it just defers the failure
+	 * of the respective operation.
+	 */
+
+	/* mark them early for readability.
+	 * this just sets some state flags. */
+	if (remote)
+		_req_mod(req, to_be_send);
+	if (local)
+		_req_mod(req, to_be_submitted);
+
+	/* check this request on the collision detection hash tables.
+	 * if we have a conflict, just complete it here.
+	 * THINK do we want to check reads, too? (I don't think so...) */
+	if (rw == WRITE && _req_conflicts(req)) {
+		/* this is a conflicting request.
+		 * even though it may have been only _partially_
+		 * overlapping with one of the currently pending requests,
+		 * without even submitting or sending it, we will
+		 * pretend that it was successfully served right now.
+		 */
+		if (local) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			drbd_al_complete_io(mdev, req->sector);
+			put_ldev(mdev);
+			local = 0;
+		}
+		if (remote)
+			dec_ap_pending(mdev);
+		_drbd_end_io_acct(mdev, req);
+		/* THINK: do we want to fail it (-EIO), or pretend success? */
+		bio_endio(req->master_bio, 0);
+		req->master_bio = NULL;
+		dec_ap_bio(mdev);
+		drbd_req_free(req);
+		remote = 0;
+	}
+
+	/* NOTE remote first: to get the concurrent write detection right,
+	 * we must register the request before start of local IO.  */
+	if (remote) {
+		/* either WRITE and C_CONNECTED,
+		 * or READ, and no local disk,
+		 * or READ, but not in sync.
+		 */
+		_req_mod(req, (rw == WRITE)
+				? queue_for_net_write
+				: queue_for_net_read);
+	}
+	spin_unlock_irq(&mdev->req_lock);
+	kfree(b); /* if someone else has beaten us to it... */
+
+	if (local) {
+		req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+
+		trace_drbd_bio(mdev, "Pri", req->private_bio, 0, NULL);
+
+		if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
+				     : rw == READ  ? DRBD_FAULT_DT_RD
+				     :               DRBD_FAULT_DT_RA))
+			bio_endio(req->private_bio, -EIO);
+		else
+			generic_make_request(req->private_bio);
+	}
+
+	/* we need to plug ALWAYS since we possibly need to kick lo_dev.
+	 * we plug after submit, so we won't miss an unplug event */
+	drbd_plug_device(mdev);
+
+	return 0;
+
+fail_free_complete:
+	if (rw == WRITE && local)
+		drbd_al_complete_io(mdev, sector);
+fail_and_free_req:
+	if (local) {
+		bio_put(req->private_bio);
+		req->private_bio = NULL;
+		put_ldev(mdev);
+	}
+	bio_endio(bio, err);
+	drbd_req_free(req);
+	dec_ap_bio(mdev);
+	kfree(b);
+
+	return 0;
+}
+
+/* helper function for drbd_make_request
+ * if we can determine just by the mdev (state) that this request will fail,
+ * return 1
+ * otherwise return 0
+ */
+static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
+{
+	/* Unconfigured */
+	if (mdev->state.conn == C_DISCONNECTING &&
+	    mdev->state.disk == D_DISKLESS)
+		return 1;
+
+	if (mdev->state.role != R_PRIMARY &&
+		(!allow_oos || is_write)) {
+		if (__ratelimit(&drbd_ratelimit_state)) {
+			dev_err(DEV, "Process %s[%u] tried to %s; "
+			    "since we are not in Primary state, "
+			    "we cannot allow this\n",
+			    current->comm, current->pid,
+			    is_write ? "WRITE" : "READ");
+		}
+		return 1;
+	}
+
+	/*
+	 * Paranoia: we might have been primary, but sync target, or
+	 * even diskless, then lost the connection.
+	 * This should have been handled (panic? suspend?) somewhere
+	 * else. But maybe it was not, so check again here.
+	 * Caution: as long as we do not have a read/write lock on mdev,
+	 * to serialize state changes, this is racy, since we may lose
+	 * the connection *after* we test for the cstate.
+	 */
+	if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+int drbd_make_request_26(struct request_queue *q, struct bio *bio)
+{
+	unsigned int s_enr, e_enr;
+	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+
+	if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
+		bio_endio(bio, -EPERM);
+		return 0;
+	}
+
+	/* Reject barrier requests if we know the underlying device does
+	 * not support them.
+	 * XXX: Need to get this info from peer as well some how so we
+	 * XXX: reject if EITHER side/data/metadata area does not support them.
+	 *
+	 * because of those XXX, this is not yet enabled,
+	 * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
+	 */
+	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
+		/* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
+		bio_endio(bio, -EOPNOTSUPP);
+		return 0;
+	}
+
+	/*
+	 * what we "blindly" assume:
+	 */
+	D_ASSERT(bio->bi_size > 0);
+	D_ASSERT((bio->bi_size & 0x1ff) == 0);
+	D_ASSERT(bio->bi_idx == 0);
+
+	/* to make some things easier, force alignment of requests within the
+	 * granularity of our hash tables */
+	s_enr = bio->bi_sector >> HT_SHIFT;
+	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
+
+	if (likely(s_enr == e_enr)) {
+		inc_ap_bio(mdev, 1);
+		return drbd_make_request_common(mdev, bio);
+	}
+
+	/* can this bio be split generically?
+	 * Maybe add our own split-arbitrary-bios function. */
+	if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
+		/* rather error out here than BUG in bio_split */
+		dev_err(DEV, "bio would need to, but cannot, be split: "
+		    "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
+		    bio->bi_vcnt, bio->bi_idx, bio->bi_size,
+		    (unsigned long long)bio->bi_sector);
+		bio_endio(bio, -EINVAL);
+	} else {
+		/* This bio crosses some boundary, so we have to split it. */
+		struct bio_pair *bp;
+		/* works for the "do not cross hash slot boundaries" case
+		 * e.g. sector 262269, size 4096
+		 * s_enr = 262269 >> 6 = 4097
+		 * e_enr = (262269+8-1) >> 6 = 4098
+		 * HT_SHIFT = 6
+		 * sps = 64, mask = 63
+		 * first_sectors = 64 - (262269 & 63) = 3
+		 */
+		const sector_t sect = bio->bi_sector;
+		const int sps = 1 << HT_SHIFT; /* sectors per slot */
+		const int mask = sps - 1;
+		const sector_t first_sectors = sps - (sect & mask);
+		bp = bio_split(bio,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+				bio_split_pool,
+#endif
+				first_sectors);
+
+		/* we need to get a "reference count" (ap_bio_cnt)
+		 * to avoid races with the disconnect/reconnect/suspend code.
+		 * In case we need to split the bio here, we need to get two references
+		 * atomically, otherwise we might deadlock when trying to submit the
+		 * second one! */
+		inc_ap_bio(mdev, 2);
+
+		D_ASSERT(e_enr == s_enr + 1);
+
+		drbd_make_request_common(mdev, &bp->bio1);
+		drbd_make_request_common(mdev, &bp->bio2);
+		bio_pair_release(bp);
+	}
+	return 0;
+}
+
+/* This is called by bio_add_page().  With this function we reduce
+ * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
+ * units (was AL_EXTENTs).
+ *
+ * we do the calculation within the lower 32bit of the byte offsets,
+ * since we don't care for actual offset, but only check whether it
+ * would cross "activity log extent" boundaries.
+ *
+ * As long as the BIO is empty we have to allow at least one bvec,
+ * regardless of size and offset.  so the resulting bio may still
+ * cross extent boundaries.  those are dealt with (bio_split) in
+ * drbd_make_request_26.
+ */
+int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+	unsigned int bio_offset =
+		(unsigned int)bvm->bi_sector << 9; /* 32 bit */
+	unsigned int bio_size = bvm->bi_size;
+	int limit, backing_limit;
+
+	limit = DRBD_MAX_SEGMENT_SIZE
+	      - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
+	if (limit < 0)
+		limit = 0;
+	if (bio_size == 0) {
+		if (limit <= bvec->bv_len)
+			limit = bvec->bv_len;
+	} else if (limit && get_ldev(mdev)) {
+		struct request_queue * const b =
+			mdev->ldev->backing_bdev->bd_disk->queue;
+		if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
+			backing_limit = b->merge_bvec_fn(b, bvm, bvec);
+			limit = min(limit, backing_limit);
+		}
+		put_ldev(mdev);
+	}
+	return limit;
+}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 000000000000..d37ab57f1209
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,327 @@
+/*
+   drbd_req.h
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+   Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+
+   DRBD is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   DRBD is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_REQ_H
+#define _DRBD_REQ_H
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_wrappers.h"
+
+/* The request callbacks will be called in irq context by the IDE drivers,
+   and in Softirqs/Tasklets/BH context by the SCSI drivers,
+   and by the receiver and worker in kernel-thread context.
+   Try to get the locking right :) */
+
+/*
+ * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
+ * associated with IO requests originating from the block layer above us.
+ *
+ * There are quite a few things that may happen to a drbd request
+ * during its lifetime.
+ *
+ *  It will be created.
+ *  It will be marked with the intention to be
+ *    submitted to local disk and/or
+ *    send via the network.
+ *
+ *  It has to be placed on the transfer log and other housekeeping lists,
+ *  In case we have a network connection.
+ *
+ *  It may be identified as a concurrent (write) request
+ *    and be handled accordingly.
+ *
+ *  It may me handed over to the local disk subsystem.
+ *  It may be completed by the local disk subsystem,
+ *    either sucessfully or with io-error.
+ *  In case it is a READ request, and it failed locally,
+ *    it may be retried remotely.
+ *
+ *  It may be queued for sending.
+ *  It may be handed over to the network stack,
+ *    which may fail.
+ *  It may be acknowledged by the "peer" according to the wire_protocol in use.
+ *    this may be a negative ack.
+ *  It may receive a faked ack when the network connection is lost and the
+ *  transfer log is cleaned up.
+ *  Sending may be canceled due to network connection loss.
+ *  When it finally has outlived its time,
+ *    corresponding dirty bits in the resync-bitmap may be cleared or set,
+ *    it will be destroyed,
+ *    and completion will be signalled to the originator,
+ *      with or without "success".
+ */
+
+enum drbd_req_event {
+	created,
+	to_be_send,
+	to_be_submitted,
+
+	/* XXX yes, now I am inconsistent...
+	 * these two are not "events" but "actions"
+	 * oh, well... */
+	queue_for_net_write,
+	queue_for_net_read,
+
+	send_canceled,
+	send_failed,
+	handed_over_to_network,
+	connection_lost_while_pending,
+	recv_acked_by_peer,
+	write_acked_by_peer,
+	write_acked_by_peer_and_sis, /* and set_in_sync */
+	conflict_discarded_by_peer,
+	neg_acked,
+	barrier_acked, /* in protocol A and B */
+	data_received, /* (remote read) */
+
+	read_completed_with_error,
+	read_ahead_completed_with_error,
+	write_completed_with_error,
+	completed_ok,
+	nothing, /* for tracing only */
+};
+
+/* encoding of request states for now.  we don't actually need that many bits.
+ * we don't need to do atomic bit operations either, since most of the time we
+ * need to look at the connection state and/or manipulate some lists at the
+ * same time, so we should hold the request lock anyways.
+ */
+enum drbd_req_state_bits {
+	/* 210
+	 * 000: no local possible
+	 * 001: to be submitted
+	 *    UNUSED, we could map: 011: submitted, completion still pending
+	 * 110: completed ok
+	 * 010: completed with error
+	 */
+	__RQ_LOCAL_PENDING,
+	__RQ_LOCAL_COMPLETED,
+	__RQ_LOCAL_OK,
+
+	/* 76543
+	 * 00000: no network possible
+	 * 00001: to be send
+	 * 00011: to be send, on worker queue
+	 * 00101: sent, expecting recv_ack (B) or write_ack (C)
+	 * 11101: sent,
+	 *        recv_ack (B) or implicit "ack" (A),
+	 *        still waiting for the barrier ack.
+	 *        master_bio may already be completed and invalidated.
+	 * 11100: write_acked (C),
+	 *        data_received (for remote read, any protocol)
+	 *        or finally the barrier ack has arrived (B,A)...
+	 *        request can be freed
+	 * 01100: neg-acked (write, protocol C)
+	 *        or neg-d-acked (read, any protocol)
+	 *        or killed from the transfer log
+	 *        during cleanup after connection loss
+	 *        request can be freed
+	 * 01000: canceled or send failed...
+	 *        request can be freed
+	 */
+
+	/* if "SENT" is not set, yet, this can still fail or be canceled.
+	 * if "SENT" is set already, we still wait for an Ack packet.
+	 * when cleared, the master_bio may be completed.
+	 * in (B,A) the request object may still linger on the transaction log
+	 * until the corresponding barrier ack comes in */
+	__RQ_NET_PENDING,
+
+	/* If it is QUEUED, and it is a WRITE, it is also registered in the
+	 * transfer log. Currently we need this flag to avoid conflicts between
+	 * worker canceling the request and tl_clear_barrier killing it from
+	 * transfer log.  We should restructure the code so this conflict does
+	 * no longer occur. */
+	__RQ_NET_QUEUED,
+
+	/* well, actually only "handed over to the network stack".
+	 *
+	 * TODO can potentially be dropped because of the similar meaning
+	 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
+	 * however it is not exactly the same. before we drop it
+	 * we must ensure that we can tell a request with network part
+	 * from a request without, regardless of what happens to it. */
+	__RQ_NET_SENT,
+
+	/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
+	 * basically this means the corresponding P_BARRIER_ACK was received */
+	__RQ_NET_DONE,
+
+	/* whether or not we know (C) or pretend (B,A) that the write
+	 * was successfully written on the peer.
+	 */
+	__RQ_NET_OK,
+
+	/* peer called drbd_set_in_sync() for this write */
+	__RQ_NET_SIS,
+
+	/* keep this last, its for the RQ_NET_MASK */
+	__RQ_NET_MAX,
+};
+
+#define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)
+
+#define RQ_LOCAL_MASK      ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
+
+#define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
+#define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_SENT        (1UL << __RQ_NET_SENT)
+#define RQ_NET_DONE        (1UL << __RQ_NET_DONE)
+#define RQ_NET_OK          (1UL << __RQ_NET_OK)
+#define RQ_NET_SIS         (1UL << __RQ_NET_SIS)
+
+/* 0x1f8 */
+#define RQ_NET_MASK        (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+
+/* epoch entries */
+static inline
+struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+	BUG_ON(mdev->ee_hash_s == 0);
+	return mdev->ee_hash +
+		((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
+}
+
+/* transfer log (drbd_request objects) */
+static inline
+struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+	BUG_ON(mdev->tl_hash_s == 0);
+	return mdev->tl_hash +
+		((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
+}
+
+/* application reads (drbd_request objects) */
+static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
+{
+	return mdev->app_reads_hash
+		+ ((unsigned int)(sector) % APP_R_HSIZE);
+}
+
+/* when we receive the answer for a read request,
+ * verify that we actually know about it */
+static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
+	u64 id, sector_t sector)
+{
+	struct hlist_head *slot = ar_hash_slot(mdev, sector);
+	struct hlist_node *n;
+	struct drbd_request *req;
+
+	hlist_for_each_entry(req, n, slot, colision) {
+		if ((unsigned long)req == (unsigned long)id) {
+			D_ASSERT(req->sector == sector);
+			return req;
+		}
+	}
+	return NULL;
+}
+
+static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
+	struct bio *bio_src)
+{
+	struct bio *bio;
+	struct drbd_request *req =
+		mempool_alloc(drbd_request_mempool, GFP_NOIO);
+	if (likely(req)) {
+		bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+		req->rq_state    = 0;
+		req->mdev        = mdev;
+		req->master_bio  = bio_src;
+		req->private_bio = bio;
+		req->epoch       = 0;
+		req->sector      = bio->bi_sector;
+		req->size        = bio->bi_size;
+		req->start_time  = jiffies;
+		INIT_HLIST_NODE(&req->colision);
+		INIT_LIST_HEAD(&req->tl_requests);
+		INIT_LIST_HEAD(&req->w.list);
+
+		bio->bi_private  = req;
+		bio->bi_end_io   = drbd_endio_pri;
+		bio->bi_next     = NULL;
+	}
+	return req;
+}
+
+static inline void drbd_req_free(struct drbd_request *req)
+{
+	mempool_free(req, drbd_request_mempool);
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+	return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
+
+/* Short lived temporary struct on the stack.
+ * We could squirrel the error to be returned into
+ * bio->bi_size, or similar. But that would be too ugly. */
+struct bio_and_error {
+	struct bio *bio;
+	int error;
+};
+
+extern void _req_may_be_done(struct drbd_request *req,
+		struct bio_and_error *m);
+extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m);
+extern void complete_master_bio(struct drbd_conf *mdev,
+		struct bio_and_error *m);
+
+/* use this if you don't want to deal with calling complete_master_bio()
+ * outside the spinlock, e.g. when walking some list on cleanup. */
+static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
+{
+	struct drbd_conf *mdev = req->mdev;
+	struct bio_and_error m;
+
+	/* __req_mod possibly frees req, do not touch req after that! */
+	__req_mod(req, what, &m);
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+}
+
+/* completion of master bio is outside of spinlock.
+ * If you need it irqsave, do it your self! */
+static inline void req_mod(struct drbd_request *req,
+		enum drbd_req_event what)
+{
+	struct drbd_conf *mdev = req->mdev;
+	struct bio_and_error m;
+	spin_lock_irq(&mdev->req_lock);
+	__req_mod(req, what, &m);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+}
+#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 000000000000..76863e3f05be
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,113 @@
+/*
+  drbd.h
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/drbd.h>
+
+static const char *drbd_conn_s_names[] = {
+	[C_STANDALONE]       = "StandAlone",
+	[C_DISCONNECTING]    = "Disconnecting",
+	[C_UNCONNECTED]      = "Unconnected",
+	[C_TIMEOUT]          = "Timeout",
+	[C_BROKEN_PIPE]      = "BrokenPipe",
+	[C_NETWORK_FAILURE]  = "NetworkFailure",
+	[C_PROTOCOL_ERROR]   = "ProtocolError",
+	[C_WF_CONNECTION]    = "WFConnection",
+	[C_WF_REPORT_PARAMS] = "WFReportParams",
+	[C_TEAR_DOWN]        = "TearDown",
+	[C_CONNECTED]        = "Connected",
+	[C_STARTING_SYNC_S]  = "StartingSyncS",
+	[C_STARTING_SYNC_T]  = "StartingSyncT",
+	[C_WF_BITMAP_S]      = "WFBitMapS",
+	[C_WF_BITMAP_T]      = "WFBitMapT",
+	[C_WF_SYNC_UUID]     = "WFSyncUUID",
+	[C_SYNC_SOURCE]      = "SyncSource",
+	[C_SYNC_TARGET]      = "SyncTarget",
+	[C_PAUSED_SYNC_S]    = "PausedSyncS",
+	[C_PAUSED_SYNC_T]    = "PausedSyncT",
+	[C_VERIFY_S]         = "VerifyS",
+	[C_VERIFY_T]         = "VerifyT",
+};
+
+static const char *drbd_role_s_names[] = {
+	[R_PRIMARY]   = "Primary",
+	[R_SECONDARY] = "Secondary",
+	[R_UNKNOWN]   = "Unknown"
+};
+
+static const char *drbd_disk_s_names[] = {
+	[D_DISKLESS]     = "Diskless",
+	[D_ATTACHING]    = "Attaching",
+	[D_FAILED]       = "Failed",
+	[D_NEGOTIATING]  = "Negotiating",
+	[D_INCONSISTENT] = "Inconsistent",
+	[D_OUTDATED]     = "Outdated",
+	[D_UNKNOWN]      = "DUnknown",
+	[D_CONSISTENT]   = "Consistent",
+	[D_UP_TO_DATE]   = "UpToDate",
+};
+
+static const char *drbd_state_sw_errors[] = {
+	[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
+	[-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
+	[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
+	[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
+	[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
+	[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
+	[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
+	[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
+	[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
+	[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
+	[-SS_DEVICE_IN_USE] = "Device is held open by someone",
+	[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
+	[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
+	[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
+	[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
+	[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
+	[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
+	[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+};
+
+const char *drbd_conn_str(enum drbd_conns s)
+{
+	/* enums are unsigned... */
+	return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
+}
+
+const char *drbd_role_str(enum drbd_role s)
+{
+	return s > R_SECONDARY   ? "TOO_LARGE" : drbd_role_s_names[s];
+}
+
+const char *drbd_disk_str(enum drbd_disk_state s)
+{
+	return s > D_UP_TO_DATE    ? "TOO_LARGE" : drbd_disk_s_names[s];
+}
+
+const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
+{
+	return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
+	       err > SS_TWO_PRIMARIES ? "TOO_LARGE"
+			: drbd_state_sw_errors[-err];
+}
diff --git a/drivers/block/drbd/drbd_tracing.c b/drivers/block/drbd/drbd_tracing.c
new file mode 100644
index 000000000000..d18d4f7b4bef
--- /dev/null
+++ b/drivers/block/drbd/drbd_tracing.c
@@ -0,0 +1,752 @@
+/*
+   drbd_tracing.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/ctype.h>
+#include "drbd_int.h"
+#include "drbd_tracing.h"
+#include <linux/drbd_tag_magic.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Philipp Reisner, Lars Ellenberg");
+MODULE_DESCRIPTION("DRBD tracepoint probes");
+MODULE_PARM_DESC(trace_mask, "Bitmap of events to trace see drbd_tracing.c");
+MODULE_PARM_DESC(trace_level, "Current tracing level (changeable in /sys)");
+MODULE_PARM_DESC(trace_devs, "Bitmap of devices to trace (changeable in /sys)");
+
+unsigned int trace_mask = 0;  /* Bitmap of events to trace */
+int trace_level;              /* Current trace level */
+int trace_devs;		      /* Bitmap of devices to trace */
+
+module_param(trace_mask, uint, 0444);
+module_param(trace_level, int, 0644);
+module_param(trace_devs, int, 0644);
+
+enum {
+	TRACE_PACKET  = 0x0001,
+	TRACE_RQ      = 0x0002,
+	TRACE_UUID    = 0x0004,
+	TRACE_RESYNC  = 0x0008,
+	TRACE_EE      = 0x0010,
+	TRACE_UNPLUG  = 0x0020,
+	TRACE_NL      = 0x0040,
+	TRACE_AL_EXT  = 0x0080,
+	TRACE_INT_RQ  = 0x0100,
+	TRACE_MD_IO   = 0x0200,
+	TRACE_EPOCH   = 0x0400,
+};
+
+/* Buffer printing support
+ * dbg_print_flags: used for Flags arg to drbd_print_buffer
+ * - DBGPRINT_BUFFADDR; if set, each line starts with the
+ *	 virtual address of the line being output. If clear,
+ *	 each line starts with the offset from the beginning
+ *	 of the buffer. */
+enum dbg_print_flags {
+    DBGPRINT_BUFFADDR = 0x0001,
+};
+
+/* Macro stuff */
+static char *nl_packet_name(int packet_type)
+{
+/* Generate packet type strings */
+#define NL_PACKET(name, number, fields) \
+	[P_ ## name] = # name,
+#define NL_INTEGER Argh!
+#define NL_BIT Argh!
+#define NL_INT64 Argh!
+#define NL_STRING Argh!
+
+	static char *nl_tag_name[P_nl_after_last_packet] = {
+#include "linux/drbd_nl.h"
+	};
+
+	return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ?
+	    nl_tag_name[packet_type] : "*Unknown*";
+}
+/* /Macro stuff */
+
+static inline int is_mdev_trace(struct drbd_conf *mdev, unsigned int level)
+{
+	return trace_level >= level && ((1 << mdev_to_minor(mdev)) & trace_devs);
+}
+
+static void probe_drbd_unplug(struct drbd_conf *mdev, char *msg)
+{
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	dev_info(DEV, "%s, ap_bio_count=%d\n", msg, atomic_read(&mdev->ap_bio_cnt));
+}
+
+static void probe_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
+{
+	static char *uuid_str[UI_EXTENDED_SIZE] = {
+		[UI_CURRENT] = "CURRENT",
+		[UI_BITMAP] = "BITMAP",
+		[UI_HISTORY_START] = "HISTORY_START",
+		[UI_HISTORY_END] = "HISTORY_END",
+		[UI_SIZE] = "SIZE",
+		[UI_FLAGS] = "FLAGS",
+	};
+
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	if (index >= UI_EXTENDED_SIZE) {
+		dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
+		return;
+	}
+
+	dev_info(DEV, " uuid[%s] now %016llX\n",
+		 uuid_str[index],
+		 (unsigned long long)mdev->ldev->md.uuid[index]);
+}
+
+static void probe_drbd_md_io(struct drbd_conf *mdev, int rw,
+			     struct drbd_backing_dev *bdev)
+{
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	dev_info(DEV, " %s metadata superblock now\n",
+		 rw == READ ? "Reading" : "Writing");
+}
+
+static void probe_drbd_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg)
+{
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	dev_info(DEV, "EE %s sec=%llus size=%u e=%p\n",
+		 msg, (unsigned long long)e->sector, e->size, e);
+}
+
+static void probe_drbd_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch,
+			     enum epoch_event ev)
+{
+	static char *epoch_event_str[] = {
+		[EV_PUT] = "put",
+		[EV_GOT_BARRIER_NR] = "got_barrier_nr",
+		[EV_BARRIER_DONE] = "barrier_done",
+		[EV_BECAME_LAST] = "became_last",
+		[EV_TRACE_FLUSH] = "issuing_flush",
+		[EV_TRACE_ADD_BARRIER] = "added_barrier",
+		[EV_TRACE_SETTING_BI] = "just set barrier_in_next_epoch",
+	};
+
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	ev &= ~EV_CLEANUP;
+
+	switch (ev) {
+	case EV_TRACE_ALLOC:
+		dev_info(DEV, "Allocate epoch %p/xxxx { } nr_epochs=%d\n", epoch, mdev->epochs);
+		break;
+	case EV_TRACE_FREE:
+		dev_info(DEV, "Freeing epoch %p/%d { size=%d } nr_epochs=%d\n",
+			 epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size),
+			 mdev->epochs);
+		break;
+	default:
+		dev_info(DEV, "Update epoch  %p/%d { size=%d active=%d %c%c n%c%c } ev=%s\n",
+			 epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size),
+			 atomic_read(&epoch->active),
+			 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) ? 'n' : '-',
+			 test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) ? 'b' : '-',
+			 test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) ? 'i' : '-',
+			 test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ? 'd' : '-',
+			 epoch_event_str[ev]);
+	}
+}
+
+static void probe_drbd_netlink(void *data, int is_req)
+{
+	struct cn_msg *msg = data;
+
+	if (is_req) {
+		struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)msg->data;
+
+		printk(KERN_INFO "drbd%d: "
+			 "Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n",
+			 nlp->drbd_minor,
+			 nl_packet_name(nlp->packet_type),
+			 nlp->packet_type,
+			 msg->seq, msg->ack, msg->len);
+	} else {
+		struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply *)msg->data;
+
+		printk(KERN_INFO "drbd%d: "
+		       "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n",
+		       nlp->minor,
+		       nlp->packet_type == P_nl_after_last_packet ?
+		       "Empty-Reply" : nl_packet_name(nlp->packet_type),
+		       nlp->packet_type,
+		       msg->seq, msg->ack, msg->len);
+	}
+}
+
+static void probe_drbd_actlog(struct drbd_conf *mdev, sector_t sector, char* msg)
+{
+	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	dev_info(DEV, "%s (sec=%llus, al_enr=%u, rs_enr=%d)\n",
+		 msg, (unsigned long long) sector, enr,
+		 (int)BM_SECT_TO_EXT(sector));
+}
+
+/**
+ * drbd_print_buffer() - Hexdump arbitrary binary data into a buffer
+ * @prefix:	String is output at the beginning of each line output.
+ * @flags:	Currently only defined flag: DBGPRINT_BUFFADDR; if set, each
+ *		line starts with the virtual address of the line being
+ *		output. If clear, each line starts with the offset from the
+ *		beginning of the buffer.
+ * @size:	Indicates the size of each entry in the buffer. Supported
+ * 		values are sizeof(char), sizeof(short) and sizeof(int)
+ * @buffer:	Start address of buffer
+ * @buffer_va:	Virtual address of start of buffer (normally the same
+ *		as Buffer, but having it separate allows it to hold
+ *		file address for example)
+ * @length:	length of buffer
+ */
+static void drbd_print_buffer(const char *prefix, unsigned int flags, int size,
+			      const void *buffer, const void *buffer_va,
+			      unsigned int length)
+
+#define LINE_SIZE       16
+#define LINE_ENTRIES    (int)(LINE_SIZE/size)
+{
+	const unsigned char *pstart;
+	const unsigned char *pstart_va;
+	const unsigned char *pend;
+	char bytes_str[LINE_SIZE*3+8], ascii_str[LINE_SIZE+8];
+	char *pbytes = bytes_str, *pascii = ascii_str;
+	int  offset = 0;
+	long sizemask;
+	int  field_width;
+	int  index;
+	const unsigned char *pend_str;
+	const unsigned char *p;
+	int count;
+
+	/* verify size parameter */
+	if (size != sizeof(char) &&
+	    size != sizeof(short) &&
+	    size != sizeof(int)) {
+		printk(KERN_DEBUG "drbd_print_buffer: "
+			"ERROR invalid size %d\n", size);
+		return;
+	}
+
+	sizemask = size-1;
+	field_width = size*2;
+
+	/* Adjust start/end to be on appropriate boundary for size */
+	buffer = (const char *)((long)buffer & ~sizemask);
+	pend   = (const unsigned char *)
+		(((long)buffer + length + sizemask) & ~sizemask);
+
+	if (flags & DBGPRINT_BUFFADDR) {
+		/* Move start back to nearest multiple of line size,
+		 * if printing address. This results in nicely formatted output
+		 * with addresses being on line size (16) byte boundaries */
+		pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1));
+	} else {
+		pstart = (const unsigned char *)buffer;
+	}
+
+	/* Set value of start VA to print if addresses asked for */
+	pstart_va = (const unsigned char *)buffer_va
+		 - ((const unsigned char *)buffer-pstart);
+
+	/* Calculate end position to nicely align right hand side */
+	pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1));
+
+	/* Init strings */
+	*pbytes = *pascii = '\0';
+
+	/* Start at beginning of first line */
+	p = pstart;
+	count = 0;
+
+	while (p < pend_str) {
+		if (p < (const unsigned char *)buffer || p >= pend) {
+			/* Before start of buffer or after end- print spaces */
+			pbytes += sprintf(pbytes, "%*c ", field_width, ' ');
+			pascii += sprintf(pascii, "%*c", size, ' ');
+			p += size;
+		} else {
+			/* Add hex and ascii to strings */
+			int val;
+			switch (size) {
+			default:
+			case 1:
+				val = *(unsigned char *)p;
+				break;
+			case 2:
+				val = *(unsigned short *)p;
+				break;
+			case 4:
+				val = *(unsigned int *)p;
+				break;
+			}
+
+			pbytes += sprintf(pbytes, "%0*x ", field_width, val);
+
+			for (index = size; index; index--) {
+				*pascii++ = isprint(*p) ? *p : '.';
+				p++;
+			}
+		}
+
+		count++;
+
+		if (count == LINE_ENTRIES || p >= pend_str) {
+			/* Null terminate and print record */
+			*pascii = '\0';
+			printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n",
+			       prefix,
+			       (flags & DBGPRINT_BUFFADDR)
+			       ? (long)pstart_va:(long)offset,
+			       LINE_ENTRIES*(field_width+1), bytes_str,
+			       LINE_SIZE, ascii_str);
+
+			/* Move onto next line */
+			pstart_va += (p-pstart);
+			pstart = p;
+			count  = 0;
+			offset += LINE_SIZE;
+
+			/* Re-init strings */
+			pbytes = bytes_str;
+			pascii = ascii_str;
+			*pbytes = *pascii = '\0';
+		}
+	}
+}
+
+static void probe_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, va_list args)
+{
+	char str[256];
+
+	if (!is_mdev_trace(mdev, level))
+		return;
+
+	if (vsnprintf(str, 256, fmt, args) >= 256)
+		str[255] = 0;
+
+	printk(KERN_INFO "%s %s: %s", dev_driver_string(disk_to_dev(mdev->vdisk)),
+	       dev_name(disk_to_dev(mdev->vdisk)), str);
+}
+
+static void probe_drbd_bio(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete,
+			   struct drbd_request *r)
+{
+#if defined(CONFIG_LBDAF) || defined(CONFIG_LBD)
+#define SECTOR_FORMAT "%Lx"
+#else
+#define SECTOR_FORMAT "%lx"
+#endif
+#define SECTOR_SHIFT 9
+
+	unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT);
+	char *faddr = (char *)(lowaddr);
+	char rb[sizeof(void *)*2+6] = { 0, };
+	struct bio_vec *bvec;
+	int segno;
+
+	const int rw = bio->bi_rw;
+	const int biorw      = (rw & (RW_MASK|RWA_MASK));
+	const int biobarrier = (rw & (1<<BIO_RW_BARRIER));
+	const int biosync = (rw & ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO)));
+
+	if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
+		return;
+
+	if (r)
+		sprintf(rb, "Req:%p ", r);
+
+	dev_info(DEV, "%s %s:%s%s%s Bio:%p %s- %soffset " SECTOR_FORMAT ", size %x\n",
+		 complete ? "<<<" : ">>>",
+		 pfx,
+		 biorw == WRITE ? "Write" : "Read",
+		 biobarrier ? " : B" : "",
+		 biosync ? " : S" : "",
+		 bio,
+		 rb,
+		 complete ? (bio_flagged(bio, BIO_UPTODATE) ? "Success, " : "Failed, ") : "",
+		 bio->bi_sector << SECTOR_SHIFT,
+		 bio->bi_size);
+
+	if (trace_level >= TRACE_LVL_METRICS &&
+	    ((biorw == WRITE) ^ complete)) {
+		printk(KERN_DEBUG "  ind     page   offset   length\n");
+		__bio_for_each_segment(bvec, bio, segno, 0) {
+			printk(KERN_DEBUG "  [%d] %p %8.8x %8.8x\n", segno,
+			       bvec->bv_page, bvec->bv_offset, bvec->bv_len);
+
+			if (trace_level >= TRACE_LVL_ALL) {
+				char *bvec_buf;
+				unsigned long flags;
+
+				bvec_buf = bvec_kmap_irq(bvec, &flags);
+
+				drbd_print_buffer("    ", DBGPRINT_BUFFADDR, 1,
+						  bvec_buf,
+						  faddr,
+						  (bvec->bv_len <= 0x80)
+						  ? bvec->bv_len : 0x80);
+
+				bvec_kunmap_irq(bvec_buf, &flags);
+
+				if (bvec->bv_len > 0x40)
+					printk(KERN_DEBUG "    ....\n");
+
+				faddr += bvec->bv_len;
+			}
+		}
+	}
+}
+
+static void probe_drbd_req(struct drbd_request *req, enum drbd_req_event what, char *msg)
+{
+	static const char *rq_event_names[] = {
+		[created] = "created",
+		[to_be_send] = "to_be_send",
+		[to_be_submitted] = "to_be_submitted",
+		[queue_for_net_write] = "queue_for_net_write",
+		[queue_for_net_read] = "queue_for_net_read",
+		[send_canceled] = "send_canceled",
+		[send_failed] = "send_failed",
+		[handed_over_to_network] = "handed_over_to_network",
+		[connection_lost_while_pending] =
+					"connection_lost_while_pending",
+		[recv_acked_by_peer] = "recv_acked_by_peer",
+		[write_acked_by_peer] = "write_acked_by_peer",
+		[neg_acked] = "neg_acked",
+		[conflict_discarded_by_peer] = "conflict_discarded_by_peer",
+		[barrier_acked] = "barrier_acked",
+		[data_received] = "data_received",
+		[read_completed_with_error] = "read_completed_with_error",
+		[read_ahead_completed_with_error] = "reada_completed_with_error",
+		[write_completed_with_error] = "write_completed_with_error",
+		[completed_ok] = "completed_ok",
+	};
+
+	struct drbd_conf *mdev = req->mdev;
+
+	const int rw = (req->master_bio == NULL ||
+			bio_data_dir(req->master_bio) == WRITE) ?
+		'W' : 'R';
+	const unsigned long s = req->rq_state;
+
+	if (what != nothing) {
+		dev_info(DEV, "__req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]);
+	} else {
+		dev_info(DEV, "%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n",
+			 msg, req, rw,
+			 s & RQ_LOCAL_PENDING ? 'p' : '-',
+			 s & RQ_LOCAL_COMPLETED ? 'c' : '-',
+			 s & RQ_LOCAL_OK ? 'o' : '-',
+			 s & RQ_NET_PENDING ? 'p' : '-',
+			 s & RQ_NET_QUEUED ? 'q' : '-',
+			 s & RQ_NET_SENT ? 's' : '-',
+			 s & RQ_NET_DONE ? 'd' : '-',
+			 s & RQ_NET_OK ? 'o' : '-',
+			 req->epoch,
+			 (unsigned long long)req->sector,
+			 req->size,
+			 drbd_conn_str(mdev->state.conn));
+	}
+}
+
+
+#define drbd_peer_str drbd_role_str
+#define drbd_pdsk_str drbd_disk_str
+
+#define PSM(A)							\
+do {								\
+	if (mask.A) {						\
+		int i = snprintf(p, len, " " #A "( %s )",	\
+				 drbd_##A##_str(val.A));	\
+		if (i >= len)					\
+			return op;				\
+		p += i;						\
+		len -= i;					\
+	}							\
+} while (0)
+
+static char *dump_st(char *p, int len, union drbd_state mask, union drbd_state val)
+{
+	char *op = p;
+	*p = '\0';
+	PSM(role);
+	PSM(peer);
+	PSM(conn);
+	PSM(disk);
+	PSM(pdsk);
+
+	return op;
+}
+
+#define INFOP(fmt, args...) \
+do { \
+	if (trace_level >= TRACE_LVL_ALL) { \
+		dev_info(DEV, "%s:%d: %s [%d] %s %s " fmt , \
+		     file, line, current->comm, current->pid, \
+		     sockname, recv ? "<<<" : ">>>" , \
+		     ## args); \
+	} else { \
+		dev_info(DEV, "%s %s " fmt, sockname, \
+		     recv ? "<<<" : ">>>" , \
+		     ## args); \
+	} \
+} while (0)
+
+static char *_dump_block_id(u64 block_id, char *buff)
+{
+	if (is_syncer_block_id(block_id))
+		strcpy(buff, "SyncerId");
+	else
+		sprintf(buff, "%llx", (unsigned long long)block_id);
+
+	return buff;
+}
+
+static void probe_drbd_packet(struct drbd_conf *mdev, struct socket *sock,
+			      int recv, union p_polymorph *p, char *file, int line)
+{
+	char *sockname = sock == mdev->meta.socket ? "meta" : "data";
+	int cmd = (recv == 2) ? p->header.command : be16_to_cpu(p->header.command);
+	char tmp[300];
+	union drbd_state m, v;
+
+	switch (cmd) {
+	case P_HAND_SHAKE:
+		INFOP("%s (protocol %u-%u)\n", cmdname(cmd),
+			be32_to_cpu(p->handshake.protocol_min),
+			be32_to_cpu(p->handshake.protocol_max));
+		break;
+
+	case P_BITMAP: /* don't report this */
+	case P_COMPRESSED_BITMAP: /* don't report this */
+		break;
+
+	case P_DATA:
+		INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd),
+		      (unsigned long long)be64_to_cpu(p->data.sector),
+		      _dump_block_id(p->data.block_id, tmp),
+		      be32_to_cpu(p->data.seq_num),
+		      be32_to_cpu(p->data.dp_flags)
+			);
+		break;
+
+	case P_DATA_REPLY:
+	case P_RS_DATA_REPLY:
+		INFOP("%s (sector %llus, id %s)\n", cmdname(cmd),
+		      (unsigned long long)be64_to_cpu(p->data.sector),
+		      _dump_block_id(p->data.block_id, tmp)
+			);
+		break;
+
+	case P_RECV_ACK:
+	case P_WRITE_ACK:
+	case P_RS_WRITE_ACK:
+	case P_DISCARD_ACK:
+	case P_NEG_ACK:
+	case P_NEG_RS_DREPLY:
+		INFOP("%s (sector %llus, size %u, id %s, seq %u)\n",
+			cmdname(cmd),
+		      (long long)be64_to_cpu(p->block_ack.sector),
+		      be32_to_cpu(p->block_ack.blksize),
+		      _dump_block_id(p->block_ack.block_id, tmp),
+		      be32_to_cpu(p->block_ack.seq_num)
+			);
+		break;
+
+	case P_DATA_REQUEST:
+	case P_RS_DATA_REQUEST:
+		INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd),
+		      (long long)be64_to_cpu(p->block_req.sector),
+		      be32_to_cpu(p->block_req.blksize),
+		      _dump_block_id(p->block_req.block_id, tmp)
+			);
+		break;
+
+	case P_BARRIER:
+	case P_BARRIER_ACK:
+		INFOP("%s (barrier %u)\n", cmdname(cmd), p->barrier.barrier);
+		break;
+
+	case P_SYNC_PARAM:
+	case P_SYNC_PARAM89:
+		INFOP("%s (rate %u, verify-alg \"%.64s\", csums-alg \"%.64s\")\n",
+			cmdname(cmd), be32_to_cpu(p->rs_param_89.rate),
+			p->rs_param_89.verify_alg, p->rs_param_89.csums_alg);
+		break;
+
+	case P_UUIDS:
+		INFOP("%s Curr:%016llX, Bitmap:%016llX, "
+		      "HisSt:%016llX, HisEnd:%016llX\n",
+		      cmdname(cmd),
+		      (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_CURRENT]),
+		      (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_BITMAP]),
+		      (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_START]),
+		      (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_END]));
+		break;
+
+	case P_SIZES:
+		INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, "
+		      "max bio %x, q order %x)\n",
+		      cmdname(cmd),
+		      (long long)(be64_to_cpu(p->sizes.d_size)>>(20-9)),
+		      (long long)(be64_to_cpu(p->sizes.u_size)>>(20-9)),
+		      (long long)(be64_to_cpu(p->sizes.c_size)>>(20-9)),
+		      be32_to_cpu(p->sizes.max_segment_size),
+		      be32_to_cpu(p->sizes.queue_order_type));
+		break;
+
+	case P_STATE:
+		v.i = be32_to_cpu(p->state.state);
+		m.i = 0xffffffff;
+		dump_st(tmp, sizeof(tmp), m, v);
+		INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp);
+		break;
+
+	case P_STATE_CHG_REQ:
+		m.i = be32_to_cpu(p->req_state.mask);
+		v.i = be32_to_cpu(p->req_state.val);
+		dump_st(tmp, sizeof(tmp), m, v);
+		INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp);
+		break;
+
+	case P_STATE_CHG_REPLY:
+		INFOP("%s (ret %x)\n", cmdname(cmd),
+		      be32_to_cpu(p->req_state_reply.retcode));
+		break;
+
+	case P_PING:
+	case P_PING_ACK:
+		/*
+		 * Dont trace pings at summary level
+		 */
+		if (trace_level < TRACE_LVL_ALL)
+			break;
+		/* fall through... */
+	default:
+		INFOP("%s (%u)\n", cmdname(cmd), cmd);
+		break;
+	}
+}
+
+
+static int __init drbd_trace_init(void)
+{
+	int ret;
+
+	if (trace_mask & TRACE_UNPLUG) {
+		ret = register_trace_drbd_unplug(probe_drbd_unplug);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_UUID) {
+		ret = register_trace_drbd_uuid(probe_drbd_uuid);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_EE) {
+		ret = register_trace_drbd_ee(probe_drbd_ee);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_PACKET) {
+		ret = register_trace_drbd_packet(probe_drbd_packet);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_MD_IO) {
+		ret = register_trace_drbd_md_io(probe_drbd_md_io);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_EPOCH) {
+		ret = register_trace_drbd_epoch(probe_drbd_epoch);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_NL) {
+		ret = register_trace_drbd_netlink(probe_drbd_netlink);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_AL_EXT) {
+		ret = register_trace_drbd_actlog(probe_drbd_actlog);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_RQ) {
+		ret = register_trace_drbd_bio(probe_drbd_bio);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_INT_RQ) {
+		ret = register_trace_drbd_req(probe_drbd_req);
+		WARN_ON(ret);
+	}
+	if (trace_mask & TRACE_RESYNC) {
+		ret = register_trace__drbd_resync(probe_drbd_resync);
+		WARN_ON(ret);
+	}
+	return 0;
+}
+
+module_init(drbd_trace_init);
+
+static void __exit drbd_trace_exit(void)
+{
+	if (trace_mask & TRACE_UNPLUG)
+		unregister_trace_drbd_unplug(probe_drbd_unplug);
+	if (trace_mask & TRACE_UUID)
+		unregister_trace_drbd_uuid(probe_drbd_uuid);
+	if (trace_mask & TRACE_EE)
+		unregister_trace_drbd_ee(probe_drbd_ee);
+	if (trace_mask & TRACE_PACKET)
+		unregister_trace_drbd_packet(probe_drbd_packet);
+	if (trace_mask & TRACE_MD_IO)
+		unregister_trace_drbd_md_io(probe_drbd_md_io);
+	if (trace_mask & TRACE_EPOCH)
+		unregister_trace_drbd_epoch(probe_drbd_epoch);
+	if (trace_mask & TRACE_NL)
+		unregister_trace_drbd_netlink(probe_drbd_netlink);
+	if (trace_mask & TRACE_AL_EXT)
+		unregister_trace_drbd_actlog(probe_drbd_actlog);
+	if (trace_mask & TRACE_RQ)
+		unregister_trace_drbd_bio(probe_drbd_bio);
+	if (trace_mask & TRACE_INT_RQ)
+		unregister_trace_drbd_req(probe_drbd_req);
+	if (trace_mask & TRACE_RESYNC)
+		unregister_trace__drbd_resync(probe_drbd_resync);
+
+	tracepoint_synchronize_unregister();
+}
+
+module_exit(drbd_trace_exit);
diff --git a/drivers/block/drbd/drbd_tracing.h b/drivers/block/drbd/drbd_tracing.h
new file mode 100644
index 000000000000..c4531a137f65
--- /dev/null
+++ b/drivers/block/drbd/drbd_tracing.h
@@ -0,0 +1,87 @@
+/*
+   drbd_tracing.h
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#ifndef DRBD_TRACING_H
+#define DRBD_TRACING_H
+
+#include <linux/tracepoint.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+enum {
+	TRACE_LVL_ALWAYS = 0,
+	TRACE_LVL_SUMMARY,
+	TRACE_LVL_METRICS,
+	TRACE_LVL_ALL,
+	TRACE_LVL_MAX
+};
+
+DECLARE_TRACE(drbd_unplug,
+	TP_PROTO(struct drbd_conf *mdev, char* msg),
+	TP_ARGS(mdev, msg));
+
+DECLARE_TRACE(drbd_uuid,
+	TP_PROTO(struct drbd_conf *mdev, enum drbd_uuid_index index),
+	TP_ARGS(mdev, index));
+
+DECLARE_TRACE(drbd_ee,
+	TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg),
+	TP_ARGS(mdev, e, msg));
+
+DECLARE_TRACE(drbd_md_io,
+	TP_PROTO(struct drbd_conf *mdev, int rw, struct drbd_backing_dev *bdev),
+	TP_ARGS(mdev, rw, bdev));
+
+DECLARE_TRACE(drbd_epoch,
+	TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev),
+	TP_ARGS(mdev, epoch, ev));
+
+DECLARE_TRACE(drbd_netlink,
+	TP_PROTO(void *data, int is_req),
+	TP_ARGS(data, is_req));
+
+DECLARE_TRACE(drbd_actlog,
+	TP_PROTO(struct drbd_conf *mdev, sector_t sector, char* msg),
+	TP_ARGS(mdev, sector, msg));
+
+DECLARE_TRACE(drbd_bio,
+	TP_PROTO(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete,
+		 struct drbd_request *r),
+	TP_ARGS(mdev, pfx, bio, complete, r));
+
+DECLARE_TRACE(drbd_req,
+	TP_PROTO(struct drbd_request *req, enum drbd_req_event what, char *msg),
+	      TP_ARGS(req, what, msg));
+
+DECLARE_TRACE(drbd_packet,
+	TP_PROTO(struct drbd_conf *mdev, struct socket *sock,
+		 int recv, union p_polymorph *p, char *file, int line),
+	TP_ARGS(mdev, sock, recv, p, file, line));
+
+DECLARE_TRACE(_drbd_resync,
+	TP_PROTO(struct drbd_conf *mdev, int level, const char *fmt, va_list args),
+	TP_ARGS(mdev, level, fmt, args));
+
+#endif
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 000000000000..fc824006e721
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
+/*
+-*- linux-c -*-
+   drbd_receiver.c
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_VLI_H
+#define _DRBD_VLI_H
+
+/*
+ * At a granularity of 4KiB storage represented per bit,
+ * and stroage sizes of several TiB,
+ * and possibly small-bandwidth replication,
+ * the bitmap transfer time can take much too long,
+ * if transmitted in plain text.
+ *
+ * We try to reduce the transfered bitmap information
+ * by encoding runlengths of bit polarity.
+ *
+ * We never actually need to encode a "zero" (runlengths are positive).
+ * But then we have to store the value of the first bit.
+ * The first bit of information thus shall encode if the first runlength
+ * gives the number of set or unset bits.
+ *
+ * We assume that large areas are either completely set or unset,
+ * which gives good compression with any runlength method,
+ * even when encoding the runlength as fixed size 32bit/64bit integers.
+ *
+ * Still, there may be areas where the polarity flips every few bits,
+ * and encoding the runlength sequence of those areas with fix size
+ * integers would be much worse than plaintext.
+ *
+ * We want to encode small runlength values with minimum code length,
+ * while still being able to encode a Huge run of all zeros.
+ *
+ * Thus we need a Variable Length Integer encoding, VLI.
+ *
+ * For some cases, we produce more code bits than plaintext input.
+ * We need to send incompressible chunks as plaintext, skip over them
+ * and then see if the next chunk compresses better.
+ *
+ * We don't care too much about "excellent" compression ratio for large
+ * runlengths (all set/all clear): whether we achieve a factor of 100
+ * or 1000 is not that much of an issue.
+ * We do not want to waste too much on short runlengths in the "noisy"
+ * parts of the bitmap, though.
+ *
+ * There are endless variants of VLI, we experimented with:
+ *  * simple byte-based
+ *  * various bit based with different code word length.
+ *
+ * To avoid yet an other configuration parameter (choice of bitmap compression
+ * algorithm) which was difficult to explain and tune, we just chose the one
+ * variant that turned out best in all test cases.
+ * Based on real world usage patterns, with device sizes ranging from a few GiB
+ * to several TiB, file server/mailserver/webserver/mysql/postgress,
+ * mostly idle to really busy, the all time winner (though sometimes only
+ * marginally better) is:
+ */
+
+/*
+ * encoding is "visualised" as
+ * __little endian__ bitstream, least significant bit first (left most)
+ *
+ * this particular encoding is chosen so that the prefix code
+ * starts as unary encoding the level, then modified so that
+ * 10 levels can be described in 8bit, with minimal overhead
+ * for the smaller levels.
+ *
+ * Number of data bits follow fibonacci sequence, with the exception of the
+ * last level (+1 data bit, so it makes 64bit total).  The only worse code when
+ * encoding bit polarity runlength is 1 plain bits => 2 code bits.
+prefix    data bits                                    max val  Nº data bits
+0 x                                                         0x2            1
+10 x                                                        0x4            1
+110 xx                                                      0x8            2
+1110 xxx                                                   0x10            3
+11110 xxx xx                                               0x30            5
+111110 xx xxxxxx                                          0x130            8
+11111100  xxxxxxxx xxxxx                                 0x2130           13
+11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
+11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
+11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
+ * maximum encodable value: 0x100000400202130 == 2**56 + some */
+
+/* compression "table":
+ transmitted   x                                0.29
+ as plaintext x                                  ........................
+             x                                   ........................
+            x                                    ........................
+           x    0.59                         0.21........................
+          x      ........................................................
+         x       .. c ...................................................
+        x    0.44.. o ...................................................
+       x .......... d ...................................................
+      x  .......... e ...................................................
+     X.............   ...................................................
+    x.............. b ...................................................
+2.0x............... i ...................................................
+ #X................ t ...................................................
+ #................. s ...........................  plain bits  ..........
+-+-----------------------------------------------------------------------
+ 1             16              32                              64
+*/
+
+/* LEVEL: (total bits, prefix bits, prefix value),
+ * sorted ascending by number of total bits.
+ * The rest of the code table is calculated at compiletime from this. */
+
+/* fibonacci data 1, 1, ... */
+#define VLI_L_1_1() do { \
+	LEVEL( 2, 1, 0x00); \
+	LEVEL( 3, 2, 0x01); \
+	LEVEL( 5, 3, 0x03); \
+	LEVEL( 7, 4, 0x07); \
+	LEVEL(10, 5, 0x0f); \
+	LEVEL(14, 6, 0x1f); \
+	LEVEL(21, 8, 0x3f); \
+	LEVEL(29, 8, 0x7f); \
+	LEVEL(42, 8, 0xbf); \
+	LEVEL(64, 8, 0xff); \
+	} while (0)
+
+/* finds a suitable level to decode the least significant part of in.
+ * returns number of bits consumed.
+ *
+ * BUG() for bad input, as that would mean a buggy code table. */
+static inline int vli_decode_bits(u64 *out, const u64 in)
+{
+	u64 adj = 1;
+
+#define LEVEL(t,b,v)					\
+	do {						\
+		if ((in & ((1 << b) -1)) == v) {	\
+			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
+			return t;			\
+		}					\
+		adj += 1ULL << (t - b);			\
+	} while (0)
+
+	VLI_L_1_1();
+
+	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
+	BUG();
+#undef LEVEL
+}
+
+/* return number of code bits needed,
+ * or negative error number */
+static inline int __vli_encode_bits(u64 *out, const u64 in)
+{
+	u64 max = 0;
+	u64 adj = 1;
+
+	if (in == 0)
+		return -EINVAL;
+
+#define LEVEL(t,b,v) do {		\
+		max += 1ULL << (t - b);	\
+		if (in <= max) {	\
+			if (out)	\
+				*out = ((in - adj) << b) | v;	\
+			return t;	\
+		}			\
+		adj = max + 1;		\
+	} while (0)
+
+	VLI_L_1_1();
+
+	return -EOVERFLOW;
+#undef LEVEL
+}
+
+#undef VLI_L_1_1
+
+/* code from here down is independend of actually used bit code */
+
+/*
+ * Code length is determined by some unique (e.g. unary) prefix.
+ * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
+ * not a byte stream.
+ */
+
+/* for the bitstream, we need a cursor */
+struct bitstream_cursor {
+	/* the current byte */
+	u8 *b;
+	/* the current bit within *b, nomalized: 0..7 */
+	unsigned int bit;
+};
+
+/* initialize cursor to point to first bit of stream */
+static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
+{
+	cur->b = s;
+	cur->bit = 0;
+}
+
+/* advance cursor by that many bits; maximum expected input value: 64,
+ * but depending on VLI implementation, it may be more. */
+static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
+{
+	bits += cur->bit;
+	cur->b = cur->b + (bits >> 3);
+	cur->bit = bits & 7;
+}
+
+/* the bitstream itself knows its length */
+struct bitstream {
+	struct bitstream_cursor cur;
+	unsigned char *buf;
+	size_t buf_len;		/* in bytes */
+
+	/* for input stream:
+	 * number of trailing 0 bits for padding
+	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
+	unsigned int pad_bits;
+};
+
+static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
+{
+	bs->buf = s;
+	bs->buf_len = len;
+	bs->pad_bits = pad_bits;
+	bitstream_cursor_reset(&bs->cur, bs->buf);
+}
+
+static inline void bitstream_rewind(struct bitstream *bs)
+{
+	bitstream_cursor_reset(&bs->cur, bs->buf);
+	memset(bs->buf, 0, bs->buf_len);
+}
+
+/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
+ * Ignores "pad_bits".
+ * Returns zero if bits == 0 (nothing to do).
+ * Returns number of bits used if successful.
+ *
+ * If there is not enough room left in bitstream,
+ * leaves bitstream unchanged and returns -ENOBUFS.
+ */
+static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
+{
+	unsigned char *b = bs->cur.b;
+	unsigned int tmp;
+
+	if (bits == 0)
+		return 0;
+
+	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
+		return -ENOBUFS;
+
+	/* paranoia: strip off hi bits; they should not be set anyways. */
+	if (bits < 64)
+		val &= ~0ULL >> (64 - bits);
+
+	*b++ |= (val & 0xff) << bs->cur.bit;
+
+	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
+		*b++ |= (val >> tmp) & 0xff;
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	return bits;
+}
+
+/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
+ *
+ * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
+ *
+ * If there are less than the requested number of valid bits left in the
+ * bitstream, still fetches all available bits.
+ *
+ * Returns number of actually fetched bits.
+ */
+static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
+{
+	u64 val;
+	unsigned int n;
+
+	if (bits > 64)
+		return -EINVAL;
+
+	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
+		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
+			- bs->cur.bit - bs->pad_bits;
+
+	if (bits == 0) {
+		*out = 0;
+		return 0;
+	}
+
+	/* get the high bits */
+	val = 0;
+	n = (bs->cur.bit + bits + 7) >> 3;
+	/* n may be at most 9, if cur.bit + bits > 64 */
+	/* which means this copies at most 8 byte */
+	if (n) {
+		memcpy(&val, bs->cur.b+1, n - 1);
+		val = le64_to_cpu(val) << (8 - bs->cur.bit);
+	}
+
+	/* we still need the low bits */
+	val |= bs->cur.b[0] >> bs->cur.bit;
+
+	/* and mask out bits we don't want */
+	val &= ~0ULL >> (64 - bits);
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	*out = val;
+
+	return bits;
+}
+
+/* encodes @in as vli into @bs;
+
+ * return values
+ *  > 0: number of bits successfully stored in bitstream
+ * -ENOBUFS @bs is full
+ * -EINVAL input zero (invalid)
+ * -EOVERFLOW input too large for this vli code (invalid)
+ */
+static inline int vli_encode_bits(struct bitstream *bs, u64 in)
+{
+	u64 code = code;
+	int bits = __vli_encode_bits(&code, in);
+
+	if (bits <= 0)
+		return bits;
+
+	return bitstream_put_bits(bs, code, bits);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 000000000000..212e9545e634
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,1529 @@
+/*
+   drbd_worker.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/autoconf.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/drbd.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_tracing.h"
+
+#define SLEEP_TIME (HZ/10)
+
+static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
+
+
+
+/* defined here:
+   drbd_md_io_complete
+   drbd_endio_write_sec
+   drbd_endio_read_sec
+   drbd_endio_pri
+
+ * more endio handlers:
+   atodb_endio in drbd_actlog.c
+   drbd_bm_async_io_complete in drbd_bitmap.c
+
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
+
+
+/* About the global_state_lock
+   Each state transition on an device holds a read lock. In case we have
+   to evaluate the sync after dependencies, we grab a write lock, because
+   we need stable states on all devices for that.  */
+rwlock_t global_state_lock;
+
+/* used for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io()
+ */
+void drbd_md_io_complete(struct bio *bio, int error)
+{
+	struct drbd_md_io *md_io;
+
+	md_io = (struct drbd_md_io *)bio->bi_private;
+	md_io->error = error;
+
+	trace_drbd_bio(md_io->mdev, "Md", bio, 1, NULL);
+
+	complete(&md_io->event);
+}
+
+/* reads on behalf of the partner,
+ * "submitted" by the receiver
+ */
+void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
+{
+	unsigned long flags = 0;
+	struct drbd_epoch_entry *e = NULL;
+	struct drbd_conf *mdev;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+	e = bio->bi_private;
+	mdev = e->mdev;
+
+	if (error)
+		dev_warn(DEV, "read: error=%d s=%llus\n", error,
+				(unsigned long long)e->sector);
+	if (!error && !uptodate) {
+		dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
+				(unsigned long long)e->sector);
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+	D_ASSERT(e->block_id != ID_VACANT);
+
+	trace_drbd_bio(mdev, "Sec", bio, 1, NULL);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	mdev->read_cnt += e->size >> 9;
+	list_del(&e->w.list);
+	if (list_empty(&mdev->read_ee))
+		wake_up(&mdev->ee_wait);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	drbd_chk_io_error(mdev, error, FALSE);
+	drbd_queue_work(&mdev->data.work, &e->w);
+	put_ldev(mdev);
+
+	trace_drbd_ee(mdev, e, "read completed");
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
+{
+	unsigned long flags = 0;
+	struct drbd_epoch_entry *e = NULL;
+	struct drbd_conf *mdev;
+	sector_t e_sector;
+	int do_wake;
+	int is_syncer_req;
+	int do_al_complete_io;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+	int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
+
+	e = bio->bi_private;
+	mdev = e->mdev;
+
+	if (error)
+		dev_warn(DEV, "write: error=%d s=%llus\n", error,
+				(unsigned long long)e->sector);
+	if (!error && !uptodate) {
+		dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
+				(unsigned long long)e->sector);
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+	/* error == -ENOTSUPP would be a better test,
+	 * alas it is not reliable */
+	if (error && is_barrier && e->flags & EE_IS_BARRIER) {
+		drbd_bump_write_ordering(mdev, WO_bdev_flush);
+		spin_lock_irqsave(&mdev->req_lock, flags);
+		list_del(&e->w.list);
+		e->w.cb = w_e_reissue;
+		/* put_ldev actually happens below, once we come here again. */
+		__release(local);
+		spin_unlock_irqrestore(&mdev->req_lock, flags);
+		drbd_queue_work(&mdev->data.work, &e->w);
+		return;
+	}
+
+	D_ASSERT(e->block_id != ID_VACANT);
+
+	trace_drbd_bio(mdev, "Sec", bio, 1, NULL);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	mdev->writ_cnt += e->size >> 9;
+	is_syncer_req = is_syncer_block_id(e->block_id);
+
+	/* after we moved e to done_ee,
+	 * we may no longer access it,
+	 * it may be freed/reused already!
+	 * (as soon as we release the req_lock) */
+	e_sector = e->sector;
+	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
+
+	list_del(&e->w.list); /* has been on active_ee or sync_ee */
+	list_add_tail(&e->w.list, &mdev->done_ee);
+
+	trace_drbd_ee(mdev, e, "write completed");
+
+	/* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
+	 * neither did we wake possibly waiting conflicting requests.
+	 * done from "drbd_process_done_ee" within the appropriate w.cb
+	 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
+
+	do_wake = is_syncer_req
+		? list_empty(&mdev->sync_ee)
+		: list_empty(&mdev->active_ee);
+
+	if (error)
+		__drbd_chk_io_error(mdev, FALSE);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	if (is_syncer_req)
+		drbd_rs_complete_io(mdev, e_sector);
+
+	if (do_wake)
+		wake_up(&mdev->ee_wait);
+
+	if (do_al_complete_io)
+		drbd_al_complete_io(mdev, e_sector);
+
+	wake_asender(mdev);
+	put_ldev(mdev);
+
+}
+
+/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
+ */
+void drbd_endio_pri(struct bio *bio, int error)
+{
+	unsigned long flags;
+	struct drbd_request *req = bio->bi_private;
+	struct drbd_conf *mdev = req->mdev;
+	struct bio_and_error m;
+	enum drbd_req_event what;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+	if (error)
+		dev_warn(DEV, "p %s: error=%d\n",
+			 bio_data_dir(bio) == WRITE ? "write" : "read", error);
+	if (!error && !uptodate) {
+		dev_warn(DEV, "p %s: setting error to -EIO\n",
+			 bio_data_dir(bio) == WRITE ? "write" : "read");
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+	trace_drbd_bio(mdev, "Pri", bio, 1, NULL);
+
+	/* to avoid recursion in __req_mod */
+	if (unlikely(error)) {
+		what = (bio_data_dir(bio) == WRITE)
+			? write_completed_with_error
+			: (bio_rw(bio) == READA)
+			  ? read_completed_with_error
+			  : read_ahead_completed_with_error;
+	} else
+		what = completed_ok;
+
+	bio_put(req->private_bio);
+	req->private_bio = ERR_PTR(error);
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	__req_mod(req, what, &m);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+}
+
+int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+	/* NOTE: mdev->ldev can be NULL by the time we get here! */
+	/* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
+
+	/* the only way this callback is scheduled is from _req_may_be_done,
+	 * when it is done and had a local write error, see comments there */
+	drbd_req_free(req);
+
+	return TRUE;
+}
+
+int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+	/* We should not detach for read io-error,
+	 * but try to WRITE the P_DATA_REPLY to the failed location,
+	 * to give the disk the chance to relocate that block */
+
+	spin_lock_irq(&mdev->req_lock);
+	if (cancel ||
+	    mdev->state.conn < C_CONNECTED ||
+	    mdev->state.pdsk <= D_INCONSISTENT) {
+		_req_mod(req, send_canceled);
+		spin_unlock_irq(&mdev->req_lock);
+		dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
+		return 1;
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	return w_send_read_req(mdev, w, 0);
+}
+
+int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	ERR_IF(cancel) return 1;
+	dev_err(DEV, "resync inactive, but callback triggered??\n");
+	return 1; /* Simply ignore this! */
+}
+
+void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
+{
+	struct hash_desc desc;
+	struct scatterlist sg;
+	struct bio_vec *bvec;
+	int i;
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	sg_init_table(&sg, 1);
+	crypto_hash_init(&desc);
+
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
+		crypto_hash_update(&desc, &sg, sg.length);
+	}
+	crypto_hash_final(&desc, digest);
+}
+
+static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int digest_size;
+	void *digest;
+	int ok;
+
+	D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		return 1;
+	}
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+		digest = kmalloc(digest_size, GFP_NOIO);
+		if (digest) {
+			drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+
+			inc_rs_pending(mdev);
+			ok = drbd_send_drequest_csum(mdev,
+						     e->sector,
+						     e->size,
+						     digest,
+						     digest_size,
+						     P_CSUM_RS_REQUEST);
+			kfree(digest);
+		} else {
+			dev_err(DEV, "kmalloc() of digest failed.\n");
+			ok = 0;
+		}
+	} else
+		ok = 1;
+
+	drbd_free_ee(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
+	return ok;
+}
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	struct drbd_epoch_entry *e;
+
+	if (!get_ldev(mdev))
+		return 0;
+
+	/* GFP_TRY, because if there is no memory available right now, this may
+	 * be rescheduled for later. It is "only" background resync, after all. */
+	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
+	if (!e) {
+		put_ldev(mdev);
+		return 2;
+	}
+
+	spin_lock_irq(&mdev->req_lock);
+	list_add(&e->w.list, &mdev->read_ee);
+	spin_unlock_irq(&mdev->req_lock);
+
+	e->private_bio->bi_end_io = drbd_endio_read_sec;
+	e->private_bio->bi_rw = READ;
+	e->w.cb = w_e_send_csum;
+
+	mdev->read_cnt += size >> 9;
+	drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
+
+	return 1;
+}
+
+void resync_timer_fn(unsigned long data)
+{
+	unsigned long flags;
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+	int queue;
+
+	spin_lock_irqsave(&mdev->req_lock, flags);
+
+	if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
+		queue = 1;
+		if (mdev->state.conn == C_VERIFY_S)
+			mdev->resync_work.cb = w_make_ov_request;
+		else
+			mdev->resync_work.cb = w_make_resync_request;
+	} else {
+		queue = 0;
+		mdev->resync_work.cb = w_resync_inactive;
+	}
+
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
+
+	/* harmless race: list_empty outside data.work.q_lock */
+	if (list_empty(&mdev->resync_work.list) && queue)
+		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
+}
+
+int w_make_resync_request(struct drbd_conf *mdev,
+		struct drbd_work *w, int cancel)
+{
+	unsigned long bit;
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+	int max_segment_size = queue_max_segment_size(mdev->rq_queue);
+	int number, i, size, pe, mx;
+	int align, queued, sndbuf;
+
+	if (unlikely(cancel))
+		return 1;
+
+	if (unlikely(mdev->state.conn < C_CONNECTED)) {
+		dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
+		return 0;
+	}
+
+	if (mdev->state.conn != C_SYNC_TARGET)
+		dev_err(DEV, "%s in w_make_resync_request\n",
+			drbd_conn_str(mdev->state.conn));
+
+	if (!get_ldev(mdev)) {
+		/* Since we only need to access mdev->rsync a
+		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
+		   to continue resync with a broken disk makes no sense at
+		   all */
+		dev_err(DEV, "Disk broke down during resync!\n");
+		mdev->resync_work.cb = w_resync_inactive;
+		return 1;
+	}
+
+	number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
+	pe = atomic_read(&mdev->rs_pending_cnt);
+
+	mutex_lock(&mdev->data.mutex);
+	if (mdev->data.socket)
+		mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
+	else
+		mx = 1;
+	mutex_unlock(&mdev->data.mutex);
+
+	/* For resync rates >160MB/sec, allow more pending RS requests */
+	if (number > mx)
+		mx = number;
+
+	/* Limit the number of pending RS requests to no more than the peer's receive buffer */
+	if ((pe + number) > mx) {
+		number = mx - pe;
+	}
+
+	for (i = 0; i < number; i++) {
+		/* Stop generating RS requests, when half of the send buffer is filled */
+		mutex_lock(&mdev->data.mutex);
+		if (mdev->data.socket) {
+			queued = mdev->data.socket->sk->sk_wmem_queued;
+			sndbuf = mdev->data.socket->sk->sk_sndbuf;
+		} else {
+			queued = 1;
+			sndbuf = 0;
+		}
+		mutex_unlock(&mdev->data.mutex);
+		if (queued > sndbuf / 2)
+			goto requeue;
+
+next_sector:
+		size = BM_BLOCK_SIZE;
+		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
+
+		if (bit == -1UL) {
+			mdev->bm_resync_fo = drbd_bm_bits(mdev);
+			mdev->resync_work.cb = w_resync_inactive;
+			put_ldev(mdev);
+			return 1;
+		}
+
+		sector = BM_BIT_TO_SECT(bit);
+
+		if (drbd_try_rs_begin_io(mdev, sector)) {
+			mdev->bm_resync_fo = bit;
+			goto requeue;
+		}
+		mdev->bm_resync_fo = bit + 1;
+
+		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
+			drbd_rs_complete_io(mdev, sector);
+			goto next_sector;
+		}
+
+#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
+		/* try to find some adjacent bits.
+		 * we stop if we have already the maximum req size.
+		 *
+		 * Additionally always align bigger requests, in order to
+		 * be prepared for all stripe sizes of software RAIDs.
+		 *
+		 * we _do_ care about the agreed-upon q->max_segment_size
+		 * here, as splitting up the requests on the other side is more
+		 * difficult.  the consequence is, that on lvm and md and other
+		 * "indirect" devices, this is dead code, since
+		 * q->max_segment_size will be PAGE_SIZE.
+		 */
+		align = 1;
+		for (;;) {
+			if (size + BM_BLOCK_SIZE > max_segment_size)
+				break;
+
+			/* Be always aligned */
+			if (sector & ((1<<(align+3))-1))
+				break;
+
+			/* do not cross extent boundaries */
+			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
+				break;
+			/* now, is it actually dirty, after all?
+			 * caution, drbd_bm_test_bit is tri-state for some
+			 * obscure reason; ( b == 0 ) would get the out-of-band
+			 * only accidentally right because of the "oddly sized"
+			 * adjustment below */
+			if (drbd_bm_test_bit(mdev, bit+1) != 1)
+				break;
+			bit++;
+			size += BM_BLOCK_SIZE;
+			if ((BM_BLOCK_SIZE << align) <= size)
+				align++;
+			i++;
+		}
+		/* if we merged some,
+		 * reset the offset to start the next drbd_bm_find_next from */
+		if (size > BM_BLOCK_SIZE)
+			mdev->bm_resync_fo = bit + 1;
+#endif
+
+		/* adjust very last sectors, in case we are oddly sized */
+		if (sector + (size>>9) > capacity)
+			size = (capacity-sector)<<9;
+		if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
+			switch (read_for_csum(mdev, sector, size)) {
+			case 0: /* Disk failure*/
+				put_ldev(mdev);
+				return 0;
+			case 2: /* Allocation failed */
+				drbd_rs_complete_io(mdev, sector);
+				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+				goto requeue;
+			/* case 1: everything ok */
+			}
+		} else {
+			inc_rs_pending(mdev);
+			if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
+					       sector, size, ID_SYNCER)) {
+				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
+				dec_rs_pending(mdev);
+				put_ldev(mdev);
+				return 0;
+			}
+		}
+	}
+
+	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
+		/* last syncer _request_ was sent,
+		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
+		 * next sync group will resume), as soon as we receive the last
+		 * resync data block, and the last bit is cleared.
+		 * until then resync "work" is "inactive" ...
+		 */
+		mdev->resync_work.cb = w_resync_inactive;
+		put_ldev(mdev);
+		return 1;
+	}
+
+ requeue:
+	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
+	put_ldev(mdev);
+	return 1;
+}
+
+static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	int number, i, size;
+	sector_t sector;
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
+
+	if (unlikely(cancel))
+		return 1;
+
+	if (unlikely(mdev->state.conn < C_CONNECTED)) {
+		dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
+		return 0;
+	}
+
+	number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
+	if (atomic_read(&mdev->rs_pending_cnt) > number)
+		goto requeue;
+
+	number -= atomic_read(&mdev->rs_pending_cnt);
+
+	sector = mdev->ov_position;
+	for (i = 0; i < number; i++) {
+		if (sector >= capacity) {
+			mdev->resync_work.cb = w_resync_inactive;
+			return 1;
+		}
+
+		size = BM_BLOCK_SIZE;
+
+		if (drbd_try_rs_begin_io(mdev, sector)) {
+			mdev->ov_position = sector;
+			goto requeue;
+		}
+
+		if (sector + (size>>9) > capacity)
+			size = (capacity-sector)<<9;
+
+		inc_rs_pending(mdev);
+		if (!drbd_send_ov_request(mdev, sector, size)) {
+			dec_rs_pending(mdev);
+			return 0;
+		}
+		sector += BM_SECT_PER_BIT;
+	}
+	mdev->ov_position = sector;
+
+ requeue:
+	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
+	return 1;
+}
+
+
+int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	kfree(w);
+	ov_oos_print(mdev);
+	drbd_resync_finished(mdev);
+
+	return 1;
+}
+
+static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	kfree(w);
+
+	drbd_resync_finished(mdev);
+
+	return 1;
+}
+
+int drbd_resync_finished(struct drbd_conf *mdev)
+{
+	unsigned long db, dt, dbdt;
+	unsigned long n_oos;
+	union drbd_state os, ns;
+	struct drbd_work *w;
+	char *khelper_cmd = NULL;
+
+	/* Remove all elements from the resync LRU. Since future actions
+	 * might set bits in the (main) bitmap, then the entries in the
+	 * resync LRU would be wrong. */
+	if (drbd_rs_del_all(mdev)) {
+		/* In case this is not possible now, most probably because
+		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
+		 * queue (or even the read operations for those packets
+		 * is not finished by now).   Retry in 100ms. */
+
+		drbd_kick_lo(mdev);
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ / 10);
+		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
+		if (w) {
+			w->cb = w_resync_finished;
+			drbd_queue_work(&mdev->data.work, w);
+			return 1;
+		}
+		dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
+	}
+
+	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
+	if (dt <= 0)
+		dt = 1;
+	db = mdev->rs_total;
+	dbdt = Bit2KB(db/dt);
+	mdev->rs_paused /= HZ;
+
+	if (!get_ldev(mdev))
+		goto out;
+
+	spin_lock_irq(&mdev->req_lock);
+	os = mdev->state;
+
+	/* This protects us against multiple calls (that can happen in the presence
+	   of application IO), and against connectivity loss just before we arrive here. */
+	if (os.conn <= C_CONNECTED)
+		goto out_unlock;
+
+	ns = os;
+	ns.conn = C_CONNECTED;
+
+	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+	     (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
+	     "Online verify " : "Resync",
+	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);
+
+	n_oos = drbd_bm_total_weight(mdev);
+
+	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
+		if (n_oos) {
+			dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
+			      n_oos, Bit2KB(1));
+			khelper_cmd = "out-of-sync";
+		}
+	} else {
+		D_ASSERT((n_oos - mdev->rs_failed) == 0);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
+			khelper_cmd = "after-resync-target";
+
+		if (mdev->csums_tfm && mdev->rs_total) {
+			const unsigned long s = mdev->rs_same_csum;
+			const unsigned long t = mdev->rs_total;
+			const int ratio =
+				(t == 0)     ? 0 :
+			(t < 100000) ? ((s*100)/t) : (s/(t/100));
+			dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
+			     "transferred %luK total %luK\n",
+			     ratio,
+			     Bit2KB(mdev->rs_same_csum),
+			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
+			     Bit2KB(mdev->rs_total));
+		}
+	}
+
+	if (mdev->rs_failed) {
+		dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			ns.disk = D_INCONSISTENT;
+			ns.pdsk = D_UP_TO_DATE;
+		} else {
+			ns.disk = D_UP_TO_DATE;
+			ns.pdsk = D_INCONSISTENT;
+		}
+	} else {
+		ns.disk = D_UP_TO_DATE;
+		ns.pdsk = D_UP_TO_DATE;
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			if (mdev->p_uuid) {
+				int i;
+				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
+					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
+				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
+				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
+			} else {
+				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
+			}
+		}
+
+		drbd_uuid_set_bm(mdev, 0UL);
+
+		if (mdev->p_uuid) {
+			/* Now the two UUID sets are equal, update what we
+			 * know of the peer. */
+			int i;
+			for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+				mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
+		}
+	}
+
+	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+out_unlock:
+	spin_unlock_irq(&mdev->req_lock);
+	put_ldev(mdev);
+out:
+	mdev->rs_total  = 0;
+	mdev->rs_failed = 0;
+	mdev->rs_paused = 0;
+	mdev->ov_start_sector = 0;
+
+	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
+		dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
+		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
+	}
+
+	if (khelper_cmd)
+		drbd_khelper(mdev, khelper_cmd);
+
+	return 1;
+}
+
+/* helper */
+static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+	if (drbd_bio_has_active_page(e->private_bio)) {
+		/* This might happen if sendpage() has not finished */
+		spin_lock_irq(&mdev->req_lock);
+		list_add_tail(&e->w.list, &mdev->net_ee);
+		spin_unlock_irq(&mdev->req_lock);
+	} else
+		drbd_free_ee(mdev, e);
+}
+
+/**
+ * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		ok = drbd_send_block(mdev, P_DATA_REPLY, e);
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
+			    (unsigned long long)e->sector);
+
+		ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
+	}
+
+	dec_unacked(mdev);
+
+	move_to_net_ee_or_free(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_block() failed\n");
+	return ok;
+}
+
+/**
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	if (get_ldev_if_state(mdev, D_FAILED)) {
+		drbd_rs_complete_io(mdev, e->sector);
+		put_ldev(mdev);
+	}
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
+			inc_rs_pending(mdev);
+			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+		} else {
+			if (__ratelimit(&drbd_ratelimit_state))
+				dev_err(DEV, "Not sending RSDataReply, "
+				    "partner DISKLESS!\n");
+			ok = 1;
+		}
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
+			    (unsigned long long)e->sector);
+
+		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+
+		/* update resync data with failure */
+		drbd_rs_failed_io(mdev, e->sector, e->size);
+	}
+
+	dec_unacked(mdev);
+
+	move_to_net_ee_or_free(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_block() failed\n");
+	return ok;
+}
+
+int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	struct digest_info *di;
+	int digest_size;
+	void *digest = NULL;
+	int ok, eq = 0;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	drbd_rs_complete_io(mdev, e->sector);
+
+	di = (struct digest_info *)(unsigned long)e->block_id;
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		/* quick hack to try to avoid a race against reconfiguration.
+		 * a real fix would be much more involved,
+		 * introducing more locking mechanisms */
+		if (mdev->csums_tfm) {
+			digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+			D_ASSERT(digest_size == di->digest_size);
+			digest = kmalloc(digest_size, GFP_NOIO);
+		}
+		if (digest) {
+			drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+
+		if (eq) {
+			drbd_set_in_sync(mdev, e->sector, e->size);
+			mdev->rs_same_csum++;
+			ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
+		} else {
+			inc_rs_pending(mdev);
+			e->block_id = ID_SYNCER;
+			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+		}
+	} else {
+		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
+	}
+
+	dec_unacked(mdev);
+
+	kfree(di);
+
+	move_to_net_ee_or_free(mdev, e);
+
+	if (unlikely(!ok))
+		dev_err(DEV, "drbd_send_block/ack() failed\n");
+	return ok;
+}
+
+int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	int digest_size;
+	void *digest;
+	int ok = 1;
+
+	if (unlikely(cancel))
+		goto out;
+
+	if (unlikely(!drbd_bio_uptodate(e->private_bio)))
+		goto out;
+
+	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+	/* FIXME if this allocation fails, online verify will not terminate! */
+	digest = kmalloc(digest_size, GFP_NOIO);
+	if (digest) {
+		drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+		inc_rs_pending(mdev);
+		ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
+					     digest, digest_size, P_OV_REPLY);
+		if (!ok)
+			dec_rs_pending(mdev);
+		kfree(digest);
+	}
+
+out:
+	drbd_free_ee(mdev, e);
+
+	dec_unacked(mdev);
+
+	return ok;
+}
+
+void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
+{
+	if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
+		mdev->ov_last_oos_size += size>>9;
+	} else {
+		mdev->ov_last_oos_start = sector;
+		mdev->ov_last_oos_size = size>>9;
+	}
+	drbd_set_out_of_sync(mdev, sector, size);
+	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+}
+
+int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+	struct digest_info *di;
+	int digest_size;
+	void *digest;
+	int ok, eq = 0;
+
+	if (unlikely(cancel)) {
+		drbd_free_ee(mdev, e);
+		dec_unacked(mdev);
+		return 1;
+	}
+
+	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
+	 * the resync lru has been cleaned up already */
+	drbd_rs_complete_io(mdev, e->sector);
+
+	di = (struct digest_info *)(unsigned long)e->block_id;
+
+	if (likely(drbd_bio_uptodate(e->private_bio))) {
+		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+		digest = kmalloc(digest_size, GFP_NOIO);
+		if (digest) {
+			drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+
+			D_ASSERT(digest_size == di->digest_size);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+	} else {
+		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
+	}
+
+	dec_unacked(mdev);
+
+	kfree(di);
+
+	if (!eq)
+		drbd_ov_oos_found(mdev, e->sector, e->size);
+	else
+		ov_oos_print(mdev);
+
+	ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
+			      eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+
+	drbd_free_ee(mdev, e);
+
+	if (--mdev->ov_left == 0) {
+		ov_oos_print(mdev);
+		drbd_resync_finished(mdev);
+	}
+
+	return ok;
+}
+
+int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
+	complete(&b->done);
+	return 1;
+}
+
+int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
+	struct p_barrier *p = &mdev->data.sbuf.barrier;
+	int ok = 1;
+
+	/* really avoid racing with tl_clear.  w.cb may have been referenced
+	 * just before it was reassigned and re-queued, so double check that.
+	 * actually, this race was harmless, since we only try to send the
+	 * barrier packet here, and otherwise do nothing with the object.
+	 * but compare with the head of w_clear_epoch */
+	spin_lock_irq(&mdev->req_lock);
+	if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
+		cancel = 1;
+	spin_unlock_irq(&mdev->req_lock);
+	if (cancel)
+		return 1;
+
+	if (!drbd_get_data_sock(mdev))
+		return 0;
+	p->barrier = b->br_number;
+	/* inc_ap_pending was done where this was queued.
+	 * dec_ap_pending will be done in got_BarrierAck
+	 * or (on connection loss) in w_clear_epoch.  */
+	ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
+				(struct p_header *)p, sizeof(*p), 0);
+	drbd_put_data_sock(mdev);
+
+	return ok;
+}
+
+int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	if (cancel)
+		return 1;
+	return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
+}
+
+/**
+ * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled);
+		return 1;
+	}
+
+	ok = drbd_send_dblock(mdev, req);
+	req_mod(req, ok ? handed_over_to_network : send_failed);
+
+	return ok;
+}
+
+/**
+ * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
+ * @mdev:	DRBD device.
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled);
+		return 1;
+	}
+
+	ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
+				(unsigned long)req);
+
+	if (!ok) {
+		/* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
+		 * so this is probably redundant */
+		if (mdev->state.conn >= C_CONNECTED)
+			drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+	}
+	req_mod(req, ok ? handed_over_to_network : send_failed);
+
+	return ok;
+}
+
+static int _drbd_may_sync_now(struct drbd_conf *mdev)
+{
+	struct drbd_conf *odev = mdev;
+
+	while (1) {
+		if (odev->sync_conf.after == -1)
+			return 1;
+		odev = minor_to_mdev(odev->sync_conf.after);
+		ERR_IF(!odev) return 1;
+		if ((odev->state.conn >= C_SYNC_SOURCE &&
+		     odev->state.conn <= C_PAUSED_SYNC_T) ||
+		    odev->state.aftr_isp || odev->state.peer_isp ||
+		    odev->state.user_isp)
+			return 0;
+	}
+}
+
+/**
+ * _drbd_pause_after() - Pause resync on all devices that may not resync now
+ * @mdev:	DRBD device.
+ *
+ * Called from process context only (admin command and after_state_ch).
+ */
+static int _drbd_pause_after(struct drbd_conf *mdev)
+{
+	struct drbd_conf *odev;
+	int i, rv = 0;
+
+	for (i = 0; i < minor_count; i++) {
+		odev = minor_to_mdev(i);
+		if (!odev)
+			continue;
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (!_drbd_may_sync_now(odev))
+			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
+			       != SS_NOTHING_TO_DO);
+	}
+
+	return rv;
+}
+
+/**
+ * _drbd_resume_next() - Resume resync on all devices that may resync now
+ * @mdev:	DRBD device.
+ *
+ * Called from process context only (admin command and worker).
+ */
+static int _drbd_resume_next(struct drbd_conf *mdev)
+{
+	struct drbd_conf *odev;
+	int i, rv = 0;
+
+	for (i = 0; i < minor_count; i++) {
+		odev = minor_to_mdev(i);
+		if (!odev)
+			continue;
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (odev->state.aftr_isp) {
+			if (_drbd_may_sync_now(odev))
+				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
+							CS_HARD, NULL)
+				       != SS_NOTHING_TO_DO) ;
+		}
+	}
+	return rv;
+}
+
+void resume_next_sg(struct drbd_conf *mdev)
+{
+	write_lock_irq(&global_state_lock);
+	_drbd_resume_next(mdev);
+	write_unlock_irq(&global_state_lock);
+}
+
+void suspend_other_sg(struct drbd_conf *mdev)
+{
+	write_lock_irq(&global_state_lock);
+	_drbd_pause_after(mdev);
+	write_unlock_irq(&global_state_lock);
+}
+
+static int sync_after_error(struct drbd_conf *mdev, int o_minor)
+{
+	struct drbd_conf *odev;
+
+	if (o_minor == -1)
+		return NO_ERROR;
+	if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
+		return ERR_SYNC_AFTER;
+
+	/* check for loops */
+	odev = minor_to_mdev(o_minor);
+	while (1) {
+		if (odev == mdev)
+			return ERR_SYNC_AFTER_CYCLE;
+
+		/* dependency chain ends here, no cycles. */
+		if (odev->sync_conf.after == -1)
+			return NO_ERROR;
+
+		/* follow the dependency chain */
+		odev = minor_to_mdev(odev->sync_conf.after);
+	}
+}
+
+int drbd_alter_sa(struct drbd_conf *mdev, int na)
+{
+	int changes;
+	int retcode;
+
+	write_lock_irq(&global_state_lock);
+	retcode = sync_after_error(mdev, na);
+	if (retcode == NO_ERROR) {
+		mdev->sync_conf.after = na;
+		do {
+			changes  = _drbd_pause_after(mdev);
+			changes |= _drbd_resume_next(mdev);
+		} while (changes);
+	}
+	write_unlock_irq(&global_state_lock);
+	return retcode;
+}
+
+/**
+ * drbd_start_resync() - Start the resync process
+ * @mdev:	DRBD device.
+ * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
+ *
+ * This function might bring you directly into one of the
+ * C_PAUSED_SYNC_* states.
+ */
+void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
+{
+	union drbd_state ns;
+	int r;
+
+	if (mdev->state.conn >= C_SYNC_SOURCE) {
+		dev_err(DEV, "Resync already running!\n");
+		return;
+	}
+
+	trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, "Resync starting: side=%s\n",
+			  side == C_SYNC_TARGET ? "SyncTarget" : "SyncSource");
+
+	/* In case a previous resync run was aborted by an IO error/detach on the peer. */
+	drbd_rs_cancel_all(mdev);
+
+	if (side == C_SYNC_TARGET) {
+		/* Since application IO was locked out during C_WF_BITMAP_T and
+		   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
+		   we check that we might make the data inconsistent. */
+		r = drbd_khelper(mdev, "before-resync-target");
+		r = (r >> 8) & 0xff;
+		if (r > 0) {
+			dev_info(DEV, "before-resync-target handler returned %d, "
+			     "dropping connection.\n", r);
+			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+			return;
+		}
+	}
+
+	drbd_state_lock(mdev);
+
+	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		drbd_state_unlock(mdev);
+		return;
+	}
+
+	if (side == C_SYNC_TARGET) {
+		mdev->bm_resync_fo = 0;
+	} else /* side == C_SYNC_SOURCE */ {
+		u64 uuid;
+
+		get_random_bytes(&uuid, sizeof(u64));
+		drbd_uuid_set(mdev, UI_BITMAP, uuid);
+		drbd_send_sync_uuid(mdev, uuid);
+
+		D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
+	}
+
+	write_lock_irq(&global_state_lock);
+	ns = mdev->state;
+
+	ns.aftr_isp = !_drbd_may_sync_now(mdev);
+
+	ns.conn = side;
+
+	if (side == C_SYNC_TARGET)
+		ns.disk = D_INCONSISTENT;
+	else /* side == C_SYNC_SOURCE */
+		ns.pdsk = D_INCONSISTENT;
+
+	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
+	ns = mdev->state;
+
+	if (ns.conn < C_CONNECTED)
+		r = SS_UNKNOWN_ERROR;
+
+	if (r == SS_SUCCESS) {
+		mdev->rs_total     =
+		mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+		mdev->rs_failed    = 0;
+		mdev->rs_paused    = 0;
+		mdev->rs_start     =
+		mdev->rs_mark_time = jiffies;
+		mdev->rs_same_csum = 0;
+		_drbd_pause_after(mdev);
+	}
+	write_unlock_irq(&global_state_lock);
+	drbd_state_unlock(mdev);
+	put_ldev(mdev);
+
+	if (r == SS_SUCCESS) {
+		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+		     drbd_conn_str(ns.conn),
+		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
+		     (unsigned long) mdev->rs_total);
+
+		if (mdev->rs_total == 0) {
+			/* Peer still reachable? Beware of failing before-resync-target handlers! */
+			request_ping(mdev);
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */
+			drbd_resync_finished(mdev);
+			return;
+		}
+
+		/* ns.conn may already be != mdev->state.conn,
+		 * we may have been paused in between, or become paused until
+		 * the timer triggers.
+		 * No matter, that is handled in resync_timer_fn() */
+		if (ns.conn == C_SYNC_TARGET)
+			mod_timer(&mdev->resync_timer, jiffies);
+
+		drbd_md_sync(mdev);
+	}
+}
+
+int drbd_worker(struct drbd_thread *thi)
+{
+	struct drbd_conf *mdev = thi->mdev;
+	struct drbd_work *w = NULL;
+	LIST_HEAD(work_list);
+	int intr = 0, i;
+
+	sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
+
+	while (get_t_state(thi) == Running) {
+		drbd_thread_current_set_cpu(mdev);
+
+		if (down_trylock(&mdev->data.work.s)) {
+			mutex_lock(&mdev->data.mutex);
+			if (mdev->data.socket && !mdev->net_conf->no_cork)
+				drbd_tcp_uncork(mdev->data.socket);
+			mutex_unlock(&mdev->data.mutex);
+
+			intr = down_interruptible(&mdev->data.work.s);
+
+			mutex_lock(&mdev->data.mutex);
+			if (mdev->data.socket  && !mdev->net_conf->no_cork)
+				drbd_tcp_cork(mdev->data.socket);
+			mutex_unlock(&mdev->data.mutex);
+		}
+
+		if (intr) {
+			D_ASSERT(intr == -EINTR);
+			flush_signals(current);
+			ERR_IF (get_t_state(thi) == Running)
+				continue;
+			break;
+		}
+
+		if (get_t_state(thi) != Running)
+			break;
+		/* With this break, we have done a down() but not consumed
+		   the entry from the list. The cleanup code takes care of
+		   this...   */
+
+		w = NULL;
+		spin_lock_irq(&mdev->data.work.q_lock);
+		ERR_IF(list_empty(&mdev->data.work.q)) {
+			/* something terribly wrong in our logic.
+			 * we were able to down() the semaphore,
+			 * but the list is empty... doh.
+			 *
+			 * what is the best thing to do now?
+			 * try again from scratch, restarting the receiver,
+			 * asender, whatnot? could break even more ugly,
+			 * e.g. when we are primary, but no good local data.
+			 *
+			 * I'll try to get away just starting over this loop.
+			 */
+			spin_unlock_irq(&mdev->data.work.q_lock);
+			continue;
+		}
+		w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
+		list_del_init(&w->list);
+		spin_unlock_irq(&mdev->data.work.q_lock);
+
+		if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
+			/* dev_warn(DEV, "worker: a callback failed! \n"); */
+			if (mdev->state.conn >= C_CONNECTED)
+				drbd_force_state(mdev,
+						NS(conn, C_NETWORK_FAILURE));
+		}
+	}
+	D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
+	D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
+
+	spin_lock_irq(&mdev->data.work.q_lock);
+	i = 0;
+	while (!list_empty(&mdev->data.work.q)) {
+		list_splice_init(&mdev->data.work.q, &work_list);
+		spin_unlock_irq(&mdev->data.work.q_lock);
+
+		while (!list_empty(&work_list)) {
+			w = list_entry(work_list.next, struct drbd_work, list);
+			list_del_init(&w->list);
+			w->cb(mdev, w, 1);
+			i++; /* dead debugging code */
+		}
+
+		spin_lock_irq(&mdev->data.work.q_lock);
+	}
+	sema_init(&mdev->data.work.s, 0);
+	/* DANGEROUS race: if someone did queue his work within the spinlock,
+	 * but up() ed outside the spinlock, we could get an up() on the
+	 * semaphore without corresponding list entry.
+	 * So don't do that.
+	 */
+	spin_unlock_irq(&mdev->data.work.q_lock);
+
+	D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
+	/* _drbd_set_state only uses stop_nowait.
+	 * wait here for the Exiting receiver. */
+	drbd_thread_stop(&mdev->receiver);
+	drbd_mdev_cleanup(mdev);
+
+	dev_info(DEV, "worker terminated\n");
+
+	clear_bit(DEVICE_DYING, &mdev->flags);
+	clear_bit(CONFIG_PENDING, &mdev->flags);
+	wake_up(&mdev->state_wait);
+
+	return 0;
+}
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
new file mode 100644
index 000000000000..f93fa111ce50
--- /dev/null
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -0,0 +1,91 @@
+#ifndef _DRBD_WRAPPERS_H
+#define _DRBD_WRAPPERS_H
+
+#include <linux/ctype.h>
+#include <linux/mm.h>
+
+/* see get_sb_bdev and bd_claim */
+extern char *drbd_sec_holder;
+
+/* sets the number of 512 byte sectors of our virtual device */
+static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
+					sector_t size)
+{
+	/* set_capacity(mdev->this_bdev->bd_disk, size); */
+	set_capacity(mdev->vdisk, size);
+	mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
+}
+
+#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
+
+static inline int drbd_bio_has_active_page(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		if (page_count(bvec->bv_page) > 1)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* bi_end_io handlers */
+extern void drbd_md_io_complete(struct bio *bio, int error);
+extern void drbd_endio_read_sec(struct bio *bio, int error);
+extern void drbd_endio_write_sec(struct bio *bio, int error);
+extern void drbd_endio_pri(struct bio *bio, int error);
+
+/*
+ * used to submit our private bio
+ */
+static inline void drbd_generic_make_request(struct drbd_conf *mdev,
+					     int fault_type, struct bio *bio)
+{
+	__release(local);
+	if (!bio->bi_bdev) {
+		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
+				"bio->bi_bdev == NULL\n",
+		       mdev_to_minor(mdev));
+		dump_stack();
+		bio_endio(bio, -ENODEV);
+		return;
+	}
+
+	if (FAULT_ACTIVE(mdev, fault_type))
+		bio_endio(bio, -EIO);
+	else
+		generic_make_request(bio);
+}
+
+static inline void drbd_plug_device(struct drbd_conf *mdev)
+{
+	struct request_queue *q;
+	q = bdev_get_queue(mdev->this_bdev);
+
+	spin_lock_irq(q->queue_lock);
+
+/* XXX the check on !blk_queue_plugged is redundant,
+ * implicitly checked in blk_plug_device */
+
+	if (!blk_queue_plugged(q)) {
+		blk_plug_device(q);
+		del_timer(&q->unplug_timer);
+		/* unplugging should not happen automatically... */
+	}
+	spin_unlock_irq(q->queue_lock);
+}
+
+static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
+{
+        return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
+                == CRYPTO_ALG_TYPE_HASH;
+}
+
+#ifndef __CHECKER__
+# undef __cond_lock
+# define __cond_lock(x,c) (c)
+#endif
+
+#endif
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
new file mode 100644
index 000000000000..69dc711f37b3
--- /dev/null
+++ b/include/linux/drbd.h
@@ -0,0 +1,349 @@
+/*
+  drbd.h
+  Kernel module for 2.6.x Kernels
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+  drbd is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  drbd is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with drbd; see the file COPYING.  If not, write to
+  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+#ifndef DRBD_H
+#define DRBD_H
+#include <linux/connector.h>
+#include <asm/types.h>
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#else
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <limits.h>
+
+/* Altough the Linux source code makes a difference between
+   generic endianness and the bitfields' endianness, there is no
+   architecture as of Linux-2.6.24-rc4 where the bitfileds' endianness
+   does not match the generic endianness. */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define __LITTLE_ENDIAN_BITFIELD
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define __BIG_ENDIAN_BITFIELD
+#else
+# error "sorry, weird endianness on this box"
+#endif
+
+#endif
+
+
+extern const char *drbd_buildtag(void);
+#define REL_VERSION "8.3.3rc2"
+#define API_VERSION 88
+#define PRO_VERSION_MIN 86
+#define PRO_VERSION_MAX 91
+
+
+enum drbd_io_error_p {
+	EP_PASS_ON, /* FIXME should the better be named "Ignore"? */
+	EP_CALL_HELPER,
+	EP_DETACH
+};
+
+enum drbd_fencing_p {
+	FP_DONT_CARE,
+	FP_RESOURCE,
+	FP_STONITH
+};
+
+enum drbd_disconnect_p {
+	DP_RECONNECT,
+	DP_DROP_NET_CONF,
+	DP_FREEZE_IO
+};
+
+enum drbd_after_sb_p {
+	ASB_DISCONNECT,
+	ASB_DISCARD_YOUNGER_PRI,
+	ASB_DISCARD_OLDER_PRI,
+	ASB_DISCARD_ZERO_CHG,
+	ASB_DISCARD_LEAST_CHG,
+	ASB_DISCARD_LOCAL,
+	ASB_DISCARD_REMOTE,
+	ASB_CONSENSUS,
+	ASB_DISCARD_SECONDARY,
+	ASB_CALL_HELPER,
+	ASB_VIOLENTLY
+};
+
+/* KEEP the order, do not delete or insert. Only append. */
+enum drbd_ret_codes {
+	ERR_CODE_BASE		= 100,
+	NO_ERROR		= 101,
+	ERR_LOCAL_ADDR		= 102,
+	ERR_PEER_ADDR		= 103,
+	ERR_OPEN_DISK		= 104,
+	ERR_OPEN_MD_DISK	= 105,
+	ERR_DISK_NOT_BDEV	= 107,
+	ERR_MD_NOT_BDEV		= 108,
+	ERR_DISK_TO_SMALL	= 111,
+	ERR_MD_DISK_TO_SMALL	= 112,
+	ERR_BDCLAIM_DISK	= 114,
+	ERR_BDCLAIM_MD_DISK	= 115,
+	ERR_MD_IDX_INVALID	= 116,
+	ERR_IO_MD_DISK		= 118,
+	ERR_MD_INVALID          = 119,
+	ERR_AUTH_ALG		= 120,
+	ERR_AUTH_ALG_ND		= 121,
+	ERR_NOMEM		= 122,
+	ERR_DISCARD		= 123,
+	ERR_DISK_CONFIGURED	= 124,
+	ERR_NET_CONFIGURED	= 125,
+	ERR_MANDATORY_TAG	= 126,
+	ERR_MINOR_INVALID	= 127,
+	ERR_INTR		= 129, /* EINTR */
+	ERR_RESIZE_RESYNC	= 130,
+	ERR_NO_PRIMARY		= 131,
+	ERR_SYNC_AFTER		= 132,
+	ERR_SYNC_AFTER_CYCLE	= 133,
+	ERR_PAUSE_IS_SET	= 134,
+	ERR_PAUSE_IS_CLEAR	= 135,
+	ERR_PACKET_NR		= 137,
+	ERR_NO_DISK		= 138,
+	ERR_NOT_PROTO_C		= 139,
+	ERR_NOMEM_BITMAP	= 140,
+	ERR_INTEGRITY_ALG	= 141, /* DRBD 8.2 only */
+	ERR_INTEGRITY_ALG_ND	= 142, /* DRBD 8.2 only */
+	ERR_CPU_MASK_PARSE	= 143, /* DRBD 8.2 only */
+	ERR_CSUMS_ALG		= 144, /* DRBD 8.2 only */
+	ERR_CSUMS_ALG_ND	= 145, /* DRBD 8.2 only */
+	ERR_VERIFY_ALG		= 146, /* DRBD 8.2 only */
+	ERR_VERIFY_ALG_ND	= 147, /* DRBD 8.2 only */
+	ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */
+	ERR_VERIFY_RUNNING	= 149, /* DRBD 8.2 only */
+	ERR_DATA_NOT_CURRENT	= 150,
+	ERR_CONNECTED		= 151, /* DRBD 8.3 only */
+
+	/* insert new ones above this line */
+	AFTER_LAST_ERR_CODE
+};
+
+#define DRBD_PROT_A   1
+#define DRBD_PROT_B   2
+#define DRBD_PROT_C   3
+
+enum drbd_role {
+	R_UNKNOWN = 0,
+	R_PRIMARY = 1,     /* role */
+	R_SECONDARY = 2,   /* role */
+	R_MASK = 3,
+};
+
+/* The order of these constants is important.
+ * The lower ones (<C_WF_REPORT_PARAMS) indicate
+ * that there is no socket!
+ * >=C_WF_REPORT_PARAMS ==> There is a socket
+ */
+enum drbd_conns {
+	C_STANDALONE,
+	C_DISCONNECTING,  /* Temporal state on the way to StandAlone. */
+	C_UNCONNECTED,    /* >= C_UNCONNECTED -> inc_net() succeeds */
+
+	/* These temporal states are all used on the way
+	 * from >= C_CONNECTED to Unconnected.
+	 * The 'disconnect reason' states
+	 * I do not allow to change beween them. */
+	C_TIMEOUT,
+	C_BROKEN_PIPE,
+	C_NETWORK_FAILURE,
+	C_PROTOCOL_ERROR,
+	C_TEAR_DOWN,
+
+	C_WF_CONNECTION,
+	C_WF_REPORT_PARAMS, /* we have a socket */
+	C_CONNECTED,      /* we have introduced each other */
+	C_STARTING_SYNC_S,  /* starting full sync by admin request. */
+	C_STARTING_SYNC_T,  /* stariing full sync by admin request. */
+	C_WF_BITMAP_S,
+	C_WF_BITMAP_T,
+	C_WF_SYNC_UUID,
+
+	/* All SyncStates are tested with this comparison
+	 * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */
+	C_SYNC_SOURCE,
+	C_SYNC_TARGET,
+	C_VERIFY_S,
+	C_VERIFY_T,
+	C_PAUSED_SYNC_S,
+	C_PAUSED_SYNC_T,
+	C_MASK = 31
+};
+
+enum drbd_disk_state {
+	D_DISKLESS,
+	D_ATTACHING,      /* In the process of reading the meta-data */
+	D_FAILED,         /* Becomes D_DISKLESS as soon as we told it the peer */
+			/* when >= D_FAILED it is legal to access mdev->bc */
+	D_NEGOTIATING,    /* Late attaching state, we need to talk to the peer */
+	D_INCONSISTENT,
+	D_OUTDATED,
+	D_UNKNOWN,       /* Only used for the peer, never for myself */
+	D_CONSISTENT,     /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */
+	D_UP_TO_DATE,       /* Only this disk state allows applications' IO ! */
+	D_MASK = 15
+};
+
+union drbd_state {
+/* According to gcc's docs is the ...
+ * The order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1).
+ * Determined by ABI.
+ * pointed out by Maxim Uvarov q<muvarov@ru.mvista.com>
+ * even though we transmit as "cpu_to_be32(state)",
+ * the offsets of the bitfields still need to be swapped
+ * on different endianess.
+ */
+	struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+		unsigned role:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned peer:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned conn:5 ;   /* 17/32	 cstates */
+		unsigned disk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned pdsk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned susp:1 ;   /* 2/2	 IO suspended  no/yes */
+		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+		unsigned peer_isp:1 ;
+		unsigned user_isp:1 ;
+		unsigned _pad:11;   /* 0	 unused */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+		unsigned _pad:11;   /* 0	 unused */
+		unsigned user_isp:1 ;
+		unsigned peer_isp:1 ;
+		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+		unsigned susp:1 ;   /* 2/2	 IO suspended  no/yes */
+		unsigned pdsk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned disk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned conn:5 ;   /* 17/32	 cstates */
+		unsigned peer:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned role:2 ;   /* 3/4	 primary/secondary/unknown */
+#else
+# error "this endianess is not supported"
+#endif
+	};
+	unsigned int i;
+};
+
+enum drbd_state_ret_codes {
+	SS_CW_NO_NEED = 4,
+	SS_CW_SUCCESS = 3,
+	SS_NOTHING_TO_DO = 2,
+	SS_SUCCESS = 1,
+	SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */
+	SS_TWO_PRIMARIES = -1,
+	SS_NO_UP_TO_DATE_DISK = -2,
+	SS_NO_LOCAL_DISK = -4,
+	SS_NO_REMOTE_DISK = -5,
+	SS_CONNECTED_OUTDATES = -6,
+	SS_PRIMARY_NOP = -7,
+	SS_RESYNC_RUNNING = -8,
+	SS_ALREADY_STANDALONE = -9,
+	SS_CW_FAILED_BY_PEER = -10,
+	SS_IS_DISKLESS = -11,
+	SS_DEVICE_IN_USE = -12,
+	SS_NO_NET_CONFIG = -13,
+	SS_NO_VERIFY_ALG = -14,       /* drbd-8.2 only */
+	SS_NEED_CONNECTION = -15,    /* drbd-8.2 only */
+	SS_LOWER_THAN_OUTDATED = -16,
+	SS_NOT_SUPPORTED = -17,      /* drbd-8.2 only */
+	SS_IN_TRANSIENT_STATE = -18,  /* Retry after the next state change */
+	SS_CONCURRENT_ST_CHG = -19,   /* Concurrent cluster side state change! */
+	SS_AFTER_LAST_ERROR = -20,    /* Keep this at bottom */
+};
+
+/* from drbd_strings.c */
+extern const char *drbd_conn_str(enum drbd_conns);
+extern const char *drbd_role_str(enum drbd_role);
+extern const char *drbd_disk_str(enum drbd_disk_state);
+extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes);
+
+#define SHARED_SECRET_MAX 64
+
+#define MDF_CONSISTENT		(1 << 0)
+#define MDF_PRIMARY_IND		(1 << 1)
+#define MDF_CONNECTED_IND	(1 << 2)
+#define MDF_FULL_SYNC		(1 << 3)
+#define MDF_WAS_UP_TO_DATE	(1 << 4)
+#define MDF_PEER_OUT_DATED	(1 << 5)
+#define MDF_CRASHED_PRIMARY     (1 << 6)
+
+enum drbd_uuid_index {
+	UI_CURRENT,
+	UI_BITMAP,
+	UI_HISTORY_START,
+	UI_HISTORY_END,
+	UI_SIZE,      /* nl-packet: number of dirty bits */
+	UI_FLAGS,     /* nl-packet: flags */
+	UI_EXTENDED_SIZE   /* Everything. */
+};
+
+enum drbd_timeout_flag {
+	UT_DEFAULT      = 0,
+	UT_DEGRADED     = 1,
+	UT_PEER_OUTDATED = 2,
+};
+
+#define UUID_JUST_CREATED ((__u64)4)
+
+#define DRBD_MAGIC 0x83740267
+#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
+
+/* these are of type "int" */
+#define DRBD_MD_INDEX_INTERNAL -1
+#define DRBD_MD_INDEX_FLEX_EXT -2
+#define DRBD_MD_INDEX_FLEX_INT -3
+
+/* Start of the new netlink/connector stuff */
+
+#define DRBD_NL_CREATE_DEVICE 0x01
+#define DRBD_NL_SET_DEFAULTS  0x02
+
+/* The following line should be moved over to linux/connector.h
+ * when the time comes */
+#ifndef CN_IDX_DRBD
+# define CN_IDX_DRBD			0x4
+/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */
+#endif
+#define CN_VAL_DRBD			0x1
+
+/* For searching a vacant cn_idx value */
+#define CN_IDX_STEP			6977
+
+struct drbd_nl_cfg_req {
+	int packet_type;
+	unsigned int drbd_minor;
+	int flags;
+	unsigned short tag_list[];
+};
+
+struct drbd_nl_cfg_reply {
+	int packet_type;
+	unsigned int minor;
+	int ret_code; /* enum ret_code or set_st_err_t */
+	unsigned short tag_list[]; /* only used with get_* calls */
+};
+
+#endif
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
new file mode 100644
index 000000000000..9d067ce46960
--- /dev/null
+++ b/include/linux/drbd_limits.h
@@ -0,0 +1,137 @@
+/*
+  drbd_limits.h
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+*/
+
+/*
+ * Our current limitations.
+ * Some of them are hard limits,
+ * some of them are arbitrary range limits, that make it easier to provide
+ * feedback about nonsense settings for certain configurable values.
+ */
+
+#ifndef DRBD_LIMITS_H
+#define DRBD_LIMITS_H 1
+
+#define DEBUG_RANGE_CHECK 0
+
+#define DRBD_MINOR_COUNT_MIN 1
+#define DRBD_MINOR_COUNT_MAX 255
+
+#define DRBD_DIALOG_REFRESH_MIN 0
+#define DRBD_DIALOG_REFRESH_MAX 600
+
+/* valid port number */
+#define DRBD_PORT_MIN 1
+#define DRBD_PORT_MAX 0xffff
+
+/* startup { */
+  /* if you want more than 3.4 days, disable */
+#define DRBD_WFC_TIMEOUT_MIN 0
+#define DRBD_WFC_TIMEOUT_MAX 300000
+#define DRBD_WFC_TIMEOUT_DEF 0
+
+#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
+#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
+#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
+
+#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
+#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
+#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
+/* }*/
+
+/* net { */
+  /* timeout, unit centi seconds
+   * more than one minute timeout is not usefull */
+#define DRBD_TIMEOUT_MIN 1
+#define DRBD_TIMEOUT_MAX 600
+#define DRBD_TIMEOUT_DEF 60       /* 6 seconds */
+
+  /* active connection retries when C_WF_CONNECTION */
+#define DRBD_CONNECT_INT_MIN 1
+#define DRBD_CONNECT_INT_MAX 120
+#define DRBD_CONNECT_INT_DEF 10   /* seconds */
+
+  /* keep-alive probes when idle */
+#define DRBD_PING_INT_MIN 1
+#define DRBD_PING_INT_MAX 120
+#define DRBD_PING_INT_DEF 10
+
+ /* timeout for the ping packets.*/
+#define DRBD_PING_TIMEO_MIN  1
+#define DRBD_PING_TIMEO_MAX  100
+#define DRBD_PING_TIMEO_DEF  5
+
+  /* max number of write requests between write barriers */
+#define DRBD_MAX_EPOCH_SIZE_MIN 1
+#define DRBD_MAX_EPOCH_SIZE_MAX 20000
+#define DRBD_MAX_EPOCH_SIZE_DEF 2048
+
+  /* I don't think that a tcp send buffer of more than 10M is usefull */
+#define DRBD_SNDBUF_SIZE_MIN  0
+#define DRBD_SNDBUF_SIZE_MAX  (10<<20)
+#define DRBD_SNDBUF_SIZE_DEF  (2*65535)
+
+#define DRBD_RCVBUF_SIZE_MIN  0
+#define DRBD_RCVBUF_SIZE_MAX  (10<<20)
+#define DRBD_RCVBUF_SIZE_DEF  (2*65535)
+
+  /* @4k PageSize -> 128kB - 512MB */
+#define DRBD_MAX_BUFFERS_MIN  32
+#define DRBD_MAX_BUFFERS_MAX  131072
+#define DRBD_MAX_BUFFERS_DEF  2048
+
+  /* @4k PageSize -> 4kB - 512MB */
+#define DRBD_UNPLUG_WATERMARK_MIN  1
+#define DRBD_UNPLUG_WATERMARK_MAX  131072
+#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
+
+  /* 0 is disabled.
+   * 200 should be more than enough even for very short timeouts */
+#define DRBD_KO_COUNT_MIN  0
+#define DRBD_KO_COUNT_MAX  200
+#define DRBD_KO_COUNT_DEF  0
+/* } */
+
+/* syncer { */
+  /* FIXME allow rate to be zero? */
+#define DRBD_RATE_MIN 1
+/* channel bonding 10 GbE, or other hardware */
+#define DRBD_RATE_MAX (4 << 20)
+#define DRBD_RATE_DEF 250  /* kb/second */
+
+  /* less than 7 would hit performance unneccessarily.
+   * 3833 is the largest prime that still does fit
+   * into 64 sectors of activity log */
+#define DRBD_AL_EXTENTS_MIN  7
+#define DRBD_AL_EXTENTS_MAX  3833
+#define DRBD_AL_EXTENTS_DEF  127
+
+#define DRBD_AFTER_MIN  -1
+#define DRBD_AFTER_MAX  255
+#define DRBD_AFTER_DEF  -1
+
+/* } */
+
+/* drbdsetup XY resize -d Z
+ * you are free to reduce the device size to nothing, if you want to.
+ * the upper limit with 64bit kernel, enough ram and flexible meta data
+ * is 16 TB, currently. */
+/* DRBD_MAX_SECTORS */
+#define DRBD_DISK_SIZE_SECT_MIN  0
+#define DRBD_DISK_SIZE_SECT_MAX  (16 * (2LLU << 30))
+#define DRBD_DISK_SIZE_SECT_DEF  0 /* = disabled = no user size... */
+
+#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
+#define DRBD_FENCING_DEF FP_DONT_CARE
+#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
+#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
+#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
+#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
+
+#define DRBD_MAX_BIO_BVECS_MIN 0
+#define DRBD_MAX_BIO_BVECS_MAX 128
+#define DRBD_MAX_BIO_BVECS_DEF 0
+
+#undef RANGE
+#endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
new file mode 100644
index 000000000000..db5721ad50d1
--- /dev/null
+++ b/include/linux/drbd_nl.h
@@ -0,0 +1,137 @@
+/*
+   PAKET( name,
+	  TYPE ( pn, pr, member )
+	  ...
+   )
+
+   You may never reissue one of the pn arguments
+*/
+
+#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
+#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
+#endif
+
+NL_PACKET(primary, 1,
+       NL_BIT(		1,	T_MAY_IGNORE,	overwrite_peer)
+)
+
+NL_PACKET(secondary, 2, )
+
+NL_PACKET(disk_conf, 3,
+	NL_INT64(	2,	T_MAY_IGNORE,	disk_size)
+	NL_STRING(	3,	T_MANDATORY,	backing_dev,	128)
+	NL_STRING(	4,	T_MANDATORY,	meta_dev,	128)
+	NL_INTEGER(	5,	T_MANDATORY,	meta_dev_idx)
+	NL_INTEGER(	6,	T_MAY_IGNORE,	on_io_error)
+	NL_INTEGER(	7,	T_MAY_IGNORE,	fencing)
+	NL_BIT(		37,	T_MAY_IGNORE,	use_bmbv)
+	NL_BIT(		53,	T_MAY_IGNORE,	no_disk_flush)
+	NL_BIT(		54,	T_MAY_IGNORE,	no_md_flush)
+	  /*  55 max_bio_size was available in 8.2.6rc2 */
+	NL_INTEGER(	56,	T_MAY_IGNORE,	max_bio_bvecs)
+	NL_BIT(		57,	T_MAY_IGNORE,	no_disk_barrier)
+	NL_BIT(		58,	T_MAY_IGNORE,	no_disk_drain)
+)
+
+NL_PACKET(detach, 4, )
+
+NL_PACKET(net_conf, 5,
+	NL_STRING(	8,	T_MANDATORY,	my_addr,	128)
+	NL_STRING(	9,	T_MANDATORY,	peer_addr,	128)
+	NL_STRING(	10,	T_MAY_IGNORE,	shared_secret,	SHARED_SECRET_MAX)
+	NL_STRING(	11,	T_MAY_IGNORE,	cram_hmac_alg,	SHARED_SECRET_MAX)
+	NL_STRING(	44,	T_MAY_IGNORE,	integrity_alg,	SHARED_SECRET_MAX)
+	NL_INTEGER(	14,	T_MAY_IGNORE,	timeout)
+	NL_INTEGER(	15,	T_MANDATORY,	wire_protocol)
+	NL_INTEGER(	16,	T_MAY_IGNORE,	try_connect_int)
+	NL_INTEGER(	17,	T_MAY_IGNORE,	ping_int)
+	NL_INTEGER(	18,	T_MAY_IGNORE,	max_epoch_size)
+	NL_INTEGER(	19,	T_MAY_IGNORE,	max_buffers)
+	NL_INTEGER(	20,	T_MAY_IGNORE,	unplug_watermark)
+	NL_INTEGER(	21,	T_MAY_IGNORE,	sndbuf_size)
+	NL_INTEGER(	22,	T_MAY_IGNORE,	ko_count)
+	NL_INTEGER(	24,	T_MAY_IGNORE,	after_sb_0p)
+	NL_INTEGER(	25,	T_MAY_IGNORE,	after_sb_1p)
+	NL_INTEGER(	26,	T_MAY_IGNORE,	after_sb_2p)
+	NL_INTEGER(	39,	T_MAY_IGNORE,	rr_conflict)
+	NL_INTEGER(	40,	T_MAY_IGNORE,	ping_timeo)
+	NL_INTEGER(	67,	T_MAY_IGNORE,	rcvbuf_size)
+	  /* 59 addr_family was available in GIT, never released */
+	NL_BIT(		60,	T_MANDATORY,	mind_af)
+	NL_BIT(		27,	T_MAY_IGNORE,	want_lose)
+	NL_BIT(		28,	T_MAY_IGNORE,	two_primaries)
+	NL_BIT(		41,	T_MAY_IGNORE,	always_asbp)
+	NL_BIT(		61,	T_MAY_IGNORE,	no_cork)
+	NL_BIT(		62,	T_MANDATORY,	auto_sndbuf_size)
+)
+
+NL_PACKET(disconnect, 6, )
+
+NL_PACKET(resize, 7,
+	NL_INT64(		29,	T_MAY_IGNORE,	resize_size)
+)
+
+NL_PACKET(syncer_conf, 8,
+	NL_INTEGER(	30,	T_MAY_IGNORE,	rate)
+	NL_INTEGER(	31,	T_MAY_IGNORE,	after)
+	NL_INTEGER(	32,	T_MAY_IGNORE,	al_extents)
+	NL_STRING(      52,     T_MAY_IGNORE,   verify_alg,     SHARED_SECRET_MAX)
+	NL_STRING(      51,     T_MAY_IGNORE,   cpu_mask,       32)
+	NL_STRING(	64,	T_MAY_IGNORE,	csums_alg,	SHARED_SECRET_MAX)
+	NL_BIT(         65,     T_MAY_IGNORE,   use_rle)
+)
+
+NL_PACKET(invalidate, 9, )
+NL_PACKET(invalidate_peer, 10, )
+NL_PACKET(pause_sync, 11, )
+NL_PACKET(resume_sync, 12, )
+NL_PACKET(suspend_io, 13, )
+NL_PACKET(resume_io, 14, )
+NL_PACKET(outdate, 15, )
+NL_PACKET(get_config, 16, )
+NL_PACKET(get_state, 17,
+	NL_INTEGER(	33,	T_MAY_IGNORE,	state_i)
+)
+
+NL_PACKET(get_uuids, 18,
+	NL_STRING(	34,	T_MAY_IGNORE,	uuids,	(UI_SIZE*sizeof(__u64)))
+	NL_INTEGER(	35,	T_MAY_IGNORE,	uuids_flags)
+)
+
+NL_PACKET(get_timeout_flag, 19,
+	NL_BIT(		36,	T_MAY_IGNORE,	use_degraded)
+)
+
+NL_PACKET(call_helper, 20,
+	NL_STRING(	38,	T_MAY_IGNORE,	helper,		32)
+)
+
+/* Tag nr 42 already allocated in drbd-8.1 development. */
+
+NL_PACKET(sync_progress, 23,
+	NL_INTEGER(	43,	T_MAY_IGNORE,	sync_progress)
+)
+
+NL_PACKET(dump_ee, 24,
+	NL_STRING(	45,	T_MAY_IGNORE,	dump_ee_reason, 32)
+	NL_STRING(	46,	T_MAY_IGNORE,	seen_digest, SHARED_SECRET_MAX)
+	NL_STRING(	47,	T_MAY_IGNORE,	calc_digest, SHARED_SECRET_MAX)
+	NL_INT64(	48,	T_MAY_IGNORE,	ee_sector)
+	NL_INT64(	49,	T_MAY_IGNORE,	ee_block_id)
+	NL_STRING(	50,	T_MAY_IGNORE,	ee_data,	32 << 10)
+)
+
+NL_PACKET(start_ov, 25,
+	NL_INT64(	66,	T_MAY_IGNORE,	start_sector)
+)
+
+NL_PACKET(new_c_uuid, 26,
+       NL_BIT(		63,	T_MANDATORY,	clear_bm)
+)
+
+#undef NL_PACKET
+#undef NL_INTEGER
+#undef NL_INT64
+#undef NL_BIT
+#undef NL_STRING
+
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h
new file mode 100644
index 000000000000..fcdff8410e99
--- /dev/null
+++ b/include/linux/drbd_tag_magic.h
@@ -0,0 +1,83 @@
+#ifndef DRBD_TAG_MAGIC_H
+#define DRBD_TAG_MAGIC_H
+
+#define TT_END     0
+#define TT_REMOVED 0xE000
+
+/* declare packet_type enums */
+enum packet_types {
+#define NL_PACKET(name, number, fields) P_ ## name = number,
+#define NL_INTEGER(pn, pr, member)
+#define NL_INT64(pn, pr, member)
+#define NL_BIT(pn, pr, member)
+#define NL_STRING(pn, pr, member, len)
+#include "drbd_nl.h"
+	P_nl_after_last_packet,
+};
+
+/* These struct are used to deduce the size of the tag lists: */
+#define NL_PACKET(name, number, fields)	\
+	struct name ## _tag_len_struct { fields };
+#define NL_INTEGER(pn, pr, member)		\
+	int member; int tag_and_len ## member;
+#define NL_INT64(pn, pr, member)		\
+	__u64 member; int tag_and_len ## member;
+#define NL_BIT(pn, pr, member)		\
+	unsigned char member:1; int tag_and_len ## member;
+#define NL_STRING(pn, pr, member, len)	\
+	unsigned char member[len]; int member ## _len; \
+	int tag_and_len ## member;
+#include "linux/drbd_nl.h"
+
+/* declate tag-list-sizes */
+static const int tag_list_sizes[] = {
+#define NL_PACKET(name, number, fields) 2 fields ,
+#define NL_INTEGER(pn, pr, member)      + 4 + 4
+#define NL_INT64(pn, pr, member)        + 4 + 8
+#define NL_BIT(pn, pr, member)          + 4 + 1
+#define NL_STRING(pn, pr, member, len)  + 4 + (len)
+#include "drbd_nl.h"
+};
+
+/* The two highest bits are used for the tag type */
+#define TT_MASK      0xC000
+#define TT_INTEGER   0x0000
+#define TT_INT64     0x4000
+#define TT_BIT       0x8000
+#define TT_STRING    0xC000
+/* The next bit indicates if processing of the tag is mandatory */
+#define T_MANDATORY  0x2000
+#define T_MAY_IGNORE 0x0000
+#define TN_MASK      0x1fff
+/* The remaining 13 bits are used to enumerate the tags */
+
+#define tag_type(T)   ((T) & TT_MASK)
+#define tag_number(T) ((T) & TN_MASK)
+
+/* declare tag enums */
+#define NL_PACKET(name, number, fields) fields
+enum drbd_tags {
+#define NL_INTEGER(pn, pr, member)     T_ ## member = pn | TT_INTEGER | pr ,
+#define NL_INT64(pn, pr, member)       T_ ## member = pn | TT_INT64   | pr ,
+#define NL_BIT(pn, pr, member)         T_ ## member = pn | TT_BIT     | pr ,
+#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING  | pr ,
+#include "drbd_nl.h"
+};
+
+struct tag {
+	const char *name;
+	int type_n_flags;
+	int max_len;
+};
+
+/* declare tag names */
+#define NL_PACKET(name, number, fields) fields
+static const struct tag tag_descriptions[] = {
+#define NL_INTEGER(pn, pr, member)     [ pn ] = { #member, TT_INTEGER | pr, sizeof(int)   },
+#define NL_INT64(pn, pr, member)       [ pn ] = { #member, TT_INT64   | pr, sizeof(__u64) },
+#define NL_BIT(pn, pr, member)         [ pn ] = { #member, TT_BIT     | pr, sizeof(int)   },
+#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING  | pr, (len)         },
+#include "drbd_nl.h"
+};
+
+#endif
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
new file mode 100644
index 000000000000..3a2b2d9b0472
--- /dev/null
+++ b/include/linux/lru_cache.h
@@ -0,0 +1,294 @@
+/*
+   lru_cache.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#ifndef LRU_CACHE_H
+#define LRU_CACHE_H
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/string.h> /* for memset */
+#include <linux/seq_file.h>
+
+/*
+This header file (and its .c file; kernel-doc of functions see there)
+  define a helper framework to easily keep track of index:label associations,
+  and changes to an "active set" of objects, as well as pending transactions,
+  to persistently record those changes.
+
+  We use an LRU policy if it is necessary to "cool down" a region currently in
+  the active set before we can "heat" a previously unused region.
+
+  Because of this later property, it is called "lru_cache".
+  As it actually Tracks Objects in an Active SeT, we could also call it
+  toast (incidentally that is what may happen to the data on the
+  backend storage uppon next resync, if we don't get it right).
+
+What for?
+
+We replicate IO (more or less synchronously) to local and remote disk.
+
+For crash recovery after replication node failure,
+  we need to resync all regions that have been target of in-flight WRITE IO
+  (in use, or "hot", regions), as we don't know wether or not those WRITEs have
+  made it to stable storage.
+
+  To avoid a "full resync", we need to persistently track these regions.
+
+  This is known as "write intent log", and can be implemented as on-disk
+  (coarse or fine grained) bitmap, or other meta data.
+
+  To avoid the overhead of frequent extra writes to this meta data area,
+  usually the condition is softened to regions that _may_ have been target of
+  in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
+  bitmap, trading frequency of meta data transactions against amount of
+  (possibly unneccessary) resync traffic.
+
+  If we set a hard limit on the area that may be "hot" at any given time, we
+  limit the amount of resync traffic needed for crash recovery.
+
+For recovery after replication link failure,
+  we need to resync all blocks that have been changed on the other replica
+  in the mean time, or, if both replica have been changed independently [*],
+  all blocks that have been changed on either replica in the mean time.
+  [*] usually as a result of a cluster split-brain and insufficient protection.
+      but there are valid use cases to do this on purpose.
+
+  Tracking those blocks can be implemented as "dirty bitmap".
+  Having it fine-grained reduces the amount of resync traffic.
+  It should also be persistent, to allow for reboots (or crashes)
+  while the replication link is down.
+
+There are various possible implementations for persistently storing
+write intent log information, three of which are mentioned here.
+
+"Chunk dirtying"
+  The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
+  To reduce the frequency of bitmap updates for write-intent log purposes,
+  one could dirty "chunks" (of some size) at a time of the (fine grained)
+  on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as
+  possible, flushing it to disk again when a previously "hot" (and on-disk
+  dirtied as full chunk) area "cools down" again (no IO in flight anymore,
+  and none expected in the near future either).
+
+"Explicit (coarse) write intent bitmap"
+  An other implementation could chose a (probably coarse) explicit bitmap,
+  for write-intent log purposes, additionally to the fine grained dirty bitmap.
+
+"Activity log"
+  Yet an other implementation may keep track of the hot regions, by starting
+  with an empty set, and writing down a journal of region numbers that have
+  become "hot", or have "cooled down" again.
+
+  To be able to use a ring buffer for this journal of changes to the active
+  set, we not only record the actual changes to that set, but also record the
+  not changing members of the set in a round robin fashion. To do so, we use a
+  fixed (but configurable) number of slots which we can identify by index, and
+  associate region numbers (labels) with these indices.
+  For each transaction recording a change to the active set, we record the
+  change itself (index: -old_label, +new_label), and which index is associated
+  with which label (index: current_label) within a certain sliding window that
+  is moved further over the available indices with each such transaction.
+
+  Thus, for crash recovery, if the ringbuffer is sufficiently large, we can
+  accurately reconstruct the active set.
+
+  Sufficiently large depends only on maximum number of active objects, and the
+  size of the sliding window recording "index: current_label" associations within
+  each transaction.
+
+  This is what we call the "activity log".
+
+  Currently we need one activity log transaction per single label change, which
+  does not give much benefit over the "dirty chunks of bitmap" approach, other
+  than potentially less seeks.
+
+  We plan to change the transaction format to support multiple changes per
+  transaction, which then would reduce several (disjoint, "random") updates to
+  the bitmap into one transaction to the activity log ring buffer.
+*/
+
+/* this defines an element in a tracked set
+ * .colision is for hash table lookup.
+ * When we process a new IO request, we know its sector, thus can deduce the
+ * region number (label) easily.  To do the label -> object lookup without a
+ * full list walk, we use a simple hash table.
+ *
+ * .list is on one of three lists:
+ *  in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
+ *     lru: unused but ready to be reused or recycled
+ *          (ts_refcnt == 0, lc_number != LC_FREE),
+ *    free: unused but ready to be recycled
+ *          (ts_refcnt == 0, lc_number == LC_FREE),
+ *
+ * an element is said to be "in the active set",
+ * if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
+ *
+ * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
+ * (total memory usage 2 pages), and up to 3833 elements on the act_log
+ * lru_cache, totalling ~215 kB for 64bit architechture, ~53 pages.
+ *
+ * We usually do not actually free these objects again, but only "recycle"
+ * them, as the change "index: -old_label, +LC_FREE" would need a transaction
+ * as well.  Which also means that using a kmem_cache to allocate the objects
+ * from wastes some resources.
+ * But it avoids high order page allocations in kmalloc.
+ */
+struct lc_element {
+	struct hlist_node colision;
+	struct list_head list;		 /* LRU list or free list */
+	unsigned refcnt;
+	/* back "pointer" into ts_cache->element[index],
+	 * for paranoia, and for "ts_element_to_index" */
+	unsigned lc_index;
+	/* if we want to track a larger set of objects,
+	 * it needs to become arch independend u64 */
+	unsigned lc_number;
+
+	/* special label when on free list */
+#define LC_FREE (~0U)
+};
+
+struct lru_cache {
+	/* the least recently used item is kept at lru->prev */
+	struct list_head lru;
+	struct list_head free;
+	struct list_head in_use;
+
+	/* the pre-created kmem cache to allocate the objects from */
+	struct kmem_cache *lc_cache;
+
+	/* size of tracked objects, used to memset(,0,) them in lc_reset */
+	size_t element_size;
+	/* offset of struct lc_element member in the tracked object */
+	size_t element_off;
+
+	/* number of elements (indices) */
+	unsigned int  nr_elements;
+	/* Arbitrary limit on maximum tracked objects. Practical limit is much
+	 * lower due to allocation failures, probably. For typical use cases,
+	 * nr_elements should be a few thousand at most.
+	 * This also limits the maximum value of ts_element.ts_index, allowing the
+	 * 8 high bits of .ts_index to be overloaded with flags in the future. */
+#define LC_MAX_ACTIVE	(1<<24)
+
+	/* statistics */
+	unsigned used; /* number of lelements currently on in_use list */
+	unsigned long hits, misses, starving, dirty, changed;
+
+	/* see below: flag-bits for lru_cache */
+	unsigned long flags;
+
+	/* when changing the label of an index element */
+	unsigned int  new_number;
+
+	/* for paranoia when changing the label of an index element */
+	struct lc_element *changing_element;
+
+	void  *lc_private;
+	const char *name;
+
+	/* nr_elements there */
+	struct hlist_head *lc_slot;
+	struct lc_element **lc_element;
+};
+
+
+/* flag-bits for lru_cache */
+enum {
+	/* debugging aid, to catch concurrent access early.
+	 * user needs to guarantee exclusive access by proper locking! */
+	__LC_PARANOIA,
+	/* if we need to change the set, but currently there is a changing
+	 * transaction pending, we are "dirty", and must deferr further
+	 * changing requests */
+	__LC_DIRTY,
+	/* if we need to change the set, but currently there is no free nor
+	 * unused element available, we are "starving", and must not give out
+	 * further references, to guarantee that eventually some refcnt will
+	 * drop to zero and we will be able to make progress again, changing
+	 * the set, writing the transaction.
+	 * if the statistics say we are frequently starving,
+	 * nr_elements is too small. */
+	__LC_STARVING,
+};
+#define LC_PARANOIA (1<<__LC_PARANOIA)
+#define LC_DIRTY    (1<<__LC_DIRTY)
+#define LC_STARVING (1<<__LC_STARVING)
+
+extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
+		unsigned e_count, size_t e_size, size_t e_off);
+extern void lc_reset(struct lru_cache *lc);
+extern void lc_destroy(struct lru_cache *lc);
+extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
+extern void lc_del(struct lru_cache *lc, struct lc_element *element);
+
+extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
+extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
+extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
+extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
+extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
+
+struct seq_file;
+extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
+
+extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
+				void (*detail) (struct seq_file *, struct lc_element *));
+
+/**
+ * lc_try_lock - can be used to stop lc_get() from changing the tracked set
+ * @lc: the lru cache to operate on
+ *
+ * Note that the reference counts and order on the active and lru lists may
+ * still change.  Returns true if we aquired the lock.
+ */
+static inline int lc_try_lock(struct lru_cache *lc)
+{
+	return !test_and_set_bit(__LC_DIRTY, &lc->flags);
+}
+
+/**
+ * lc_unlock - unlock @lc, allow lc_get() to change the set again
+ * @lc: the lru cache to operate on
+ */
+static inline void lc_unlock(struct lru_cache *lc)
+{
+	clear_bit(__LC_DIRTY, &lc->flags);
+	smp_mb__after_clear_bit();
+}
+
+static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
+{
+	struct lc_element *e = lc_find(lc, enr);
+	return e && e->refcnt;
+}
+
+#define lc_entry(ptr, type, member) \
+	container_of(ptr, type, member)
+
+extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
+extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index bb1326d3839c..1cfe51628e1b 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -200,4 +200,7 @@ config NLATTR
 config GENERIC_ATOMIC64
        bool
 
+config LRU_CACHE
+	tristate
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 2e78277eff9d..347ad8db29d3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -91,6 +91,8 @@ obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
 
 obj-$(CONFIG_NLATTR) += nlattr.o
 
+obj-$(CONFIG_LRU_CACHE) += lru_cache.o
+
 obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o
 
 obj-$(CONFIG_GENERIC_CSUM) += checksum.o
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
new file mode 100644
index 000000000000..270de9d31b8c
--- /dev/null
+++ b/lib/lru_cache.c
@@ -0,0 +1,560 @@
+/*
+   lru_cache.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/string.h> /* for memset */
+#include <linux/seq_file.h> /* for seq_printf */
+#include <linux/lru_cache.h>
+
+MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
+	      "Lars Ellenberg <lars@linbit.com>");
+MODULE_DESCRIPTION("lru_cache - Track sets of hot objects");
+MODULE_LICENSE("GPL");
+
+/* this is developers aid only.
+ * it catches concurrent access (lack of locking on the users part) */
+#define PARANOIA_ENTRY() do {		\
+	BUG_ON(!lc);			\
+	BUG_ON(!lc->nr_elements);	\
+	BUG_ON(test_and_set_bit(__LC_PARANOIA, &lc->flags)); \
+} while (0)
+
+#define RETURN(x...)     do { \
+	clear_bit(__LC_PARANOIA, &lc->flags); \
+	smp_mb__after_clear_bit(); return x ; } while (0)
+
+/* BUG() if e is not one of the elements tracked by lc */
+#define PARANOIA_LC_ELEMENT(lc, e) do {	\
+	struct lru_cache *lc_ = (lc);	\
+	struct lc_element *e_ = (e);	\
+	unsigned i = e_->lc_index;	\
+	BUG_ON(i >= lc_->nr_elements);	\
+	BUG_ON(lc_->lc_element[i] != e_); } while (0)
+
+/**
+ * lc_create - prepares to track objects in an active set
+ * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details
+ * @e_count: number of elements allowed to be active simultaneously
+ * @e_size: size of the tracked objects
+ * @e_off: offset to the &struct lc_element member in a tracked object
+ *
+ * Returns a pointer to a newly initialized struct lru_cache on success,
+ * or NULL on (allocation) failure.
+ */
+struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
+		unsigned e_count, size_t e_size, size_t e_off)
+{
+	struct hlist_head *slot = NULL;
+	struct lc_element **element = NULL;
+	struct lru_cache *lc;
+	struct lc_element *e;
+	unsigned cache_obj_size = kmem_cache_size(cache);
+	unsigned i;
+
+	WARN_ON(cache_obj_size < e_size);
+	if (cache_obj_size < e_size)
+		return NULL;
+
+	/* e_count too big; would probably fail the allocation below anyways.
+	 * for typical use cases, e_count should be few thousand at most. */
+	if (e_count > LC_MAX_ACTIVE)
+		return NULL;
+
+	slot = kzalloc(e_count * sizeof(struct hlist_head*), GFP_KERNEL);
+	if (!slot)
+		goto out_fail;
+	element = kzalloc(e_count * sizeof(struct lc_element *), GFP_KERNEL);
+	if (!element)
+		goto out_fail;
+
+	lc = kzalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc)
+		goto out_fail;
+
+	INIT_LIST_HEAD(&lc->in_use);
+	INIT_LIST_HEAD(&lc->lru);
+	INIT_LIST_HEAD(&lc->free);
+
+	lc->name = name;
+	lc->element_size = e_size;
+	lc->element_off = e_off;
+	lc->nr_elements = e_count;
+	lc->new_number = LC_FREE;
+	lc->lc_cache = cache;
+	lc->lc_element = element;
+	lc->lc_slot = slot;
+
+	/* preallocate all objects */
+	for (i = 0; i < e_count; i++) {
+		void *p = kmem_cache_alloc(cache, GFP_KERNEL);
+		if (!p)
+			break;
+		memset(p, 0, lc->element_size);
+		e = p + e_off;
+		e->lc_index = i;
+		e->lc_number = LC_FREE;
+		list_add(&e->list, &lc->free);
+		element[i] = e;
+	}
+	if (i == e_count)
+		return lc;
+
+	/* else: could not allocate all elements, give up */
+	for (i--; i; i--) {
+		void *p = element[i];
+		kmem_cache_free(cache, p - e_off);
+	}
+	kfree(lc);
+out_fail:
+	kfree(element);
+	kfree(slot);
+	return NULL;
+}
+
+void lc_free_by_index(struct lru_cache *lc, unsigned i)
+{
+	void *p = lc->lc_element[i];
+	WARN_ON(!p);
+	if (p) {
+		p -= lc->element_off;
+		kmem_cache_free(lc->lc_cache, p);
+	}
+}
+
+/**
+ * lc_destroy - frees memory allocated by lc_create()
+ * @lc: the lru cache to destroy
+ */
+void lc_destroy(struct lru_cache *lc)
+{
+	unsigned i;
+	if (!lc)
+		return;
+	for (i = 0; i < lc->nr_elements; i++)
+		lc_free_by_index(lc, i);
+	kfree(lc->lc_element);
+	kfree(lc->lc_slot);
+	kfree(lc);
+}
+
+/**
+ * lc_reset - does a full reset for @lc and the hash table slots.
+ * @lc: the lru cache to operate on
+ *
+ * It is roughly the equivalent of re-allocating a fresh lru_cache object,
+ * basically a short cut to lc_destroy(lc); lc = lc_create(...);
+ */
+void lc_reset(struct lru_cache *lc)
+{
+	unsigned i;
+
+	INIT_LIST_HEAD(&lc->in_use);
+	INIT_LIST_HEAD(&lc->lru);
+	INIT_LIST_HEAD(&lc->free);
+	lc->used = 0;
+	lc->hits = 0;
+	lc->misses = 0;
+	lc->starving = 0;
+	lc->dirty = 0;
+	lc->changed = 0;
+	lc->flags = 0;
+	lc->changing_element = NULL;
+	lc->new_number = LC_FREE;
+	memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements);
+
+	for (i = 0; i < lc->nr_elements; i++) {
+		struct lc_element *e = lc->lc_element[i];
+		void *p = e;
+		p -= lc->element_off;
+		memset(p, 0, lc->element_size);
+		/* re-init it */
+		e->lc_index = i;
+		e->lc_number = LC_FREE;
+		list_add(&e->list, &lc->free);
+	}
+}
+
+/**
+ * lc_seq_printf_stats - print stats about @lc into @seq
+ * @seq: the seq_file to print into
+ * @lc: the lru cache to print statistics of
+ */
+size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
+{
+	/* NOTE:
+	 * total calls to lc_get are
+	 * (starving + hits + misses)
+	 * misses include "dirty" count (update from an other thread in
+	 * progress) and "changed", when this in fact lead to an successful
+	 * update of the cache.
+	 */
+	return seq_printf(seq, "\t%s: used:%u/%u "
+		"hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
+		lc->name, lc->used, lc->nr_elements,
+		lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
+}
+
+static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
+{
+	return  lc->lc_slot + (enr % lc->nr_elements);
+}
+
+
+/**
+ * lc_find - find element by label, if present in the hash table
+ * @lc: The lru_cache object
+ * @enr: element number
+ *
+ * Returns the pointer to an element, if the element with the requested
+ * "label" or element number is present in the hash table,
+ * or NULL if not found. Does not change the refcnt.
+ */
+struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
+{
+	struct hlist_node *n;
+	struct lc_element *e;
+
+	BUG_ON(!lc);
+	BUG_ON(!lc->nr_elements);
+	hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
+		if (e->lc_number == enr)
+			return e;
+	}
+	return NULL;
+}
+
+/* returned element will be "recycled" immediately */
+static struct lc_element *lc_evict(struct lru_cache *lc)
+{
+	struct list_head  *n;
+	struct lc_element *e;
+
+	if (list_empty(&lc->lru))
+		return NULL;
+
+	n = lc->lru.prev;
+	e = list_entry(n, struct lc_element, list);
+
+	PARANOIA_LC_ELEMENT(lc, e);
+
+	list_del(&e->list);
+	hlist_del(&e->colision);
+	return e;
+}
+
+/**
+ * lc_del - removes an element from the cache
+ * @lc: The lru_cache object
+ * @e: The element to remove
+ *
+ * @e must be unused (refcnt == 0). Moves @e from "lru" to "free" list,
+ * sets @e->enr to %LC_FREE.
+ */
+void lc_del(struct lru_cache *lc, struct lc_element *e)
+{
+	PARANOIA_ENTRY();
+	PARANOIA_LC_ELEMENT(lc, e);
+	BUG_ON(e->refcnt);
+
+	e->lc_number = LC_FREE;
+	hlist_del_init(&e->colision);
+	list_move(&e->list, &lc->free);
+	RETURN();
+}
+
+static struct lc_element *lc_get_unused_element(struct lru_cache *lc)
+{
+	struct list_head *n;
+
+	if (list_empty(&lc->free))
+		return lc_evict(lc);
+
+	n = lc->free.next;
+	list_del(n);
+	return list_entry(n, struct lc_element, list);
+}
+
+static int lc_unused_element_available(struct lru_cache *lc)
+{
+	if (!list_empty(&lc->free))
+		return 1; /* something on the free list */
+	if (!list_empty(&lc->lru))
+		return 1;  /* something to evict */
+
+	return 0;
+}
+
+
+/**
+ * lc_get - get element by label, maybe change the active set
+ * @lc: the lru cache to operate on
+ * @enr: the label to look up
+ *
+ * Finds an element in the cache, increases its usage count,
+ * "touches" and returns it.
+ *
+ * In case the requested number is not present, it needs to be added to the
+ * cache. Therefore it is possible that an other element becomes evicted from
+ * the cache. In either case, the user is notified so he is able to e.g. keep
+ * a persistent log of the cache changes, and therefore the objects in use.
+ *
+ * Return values:
+ *  NULL
+ *     The cache was marked %LC_STARVING,
+ *     or the requested label was not in the active set
+ *     and a changing transaction is still pending (@lc was marked %LC_DIRTY).
+ *     Or no unused or free element could be recycled (@lc will be marked as
+ *     %LC_STARVING, blocking further lc_get() operations).
+ *
+ *  pointer to the element with the REQUESTED element number.
+ *     In this case, it can be used right away
+ *
+ *  pointer to an UNUSED element with some different element number,
+ *          where that different number may also be %LC_FREE.
+ *
+ *          In this case, the cache is marked %LC_DIRTY (blocking further changes),
+ *          and the returned element pointer is removed from the lru list and
+ *          hash collision chains.  The user now should do whatever housekeeping
+ *          is necessary.
+ *          Then he must call lc_changed(lc,element_pointer), to finish
+ *          the change.
+ *
+ * NOTE: The user needs to check the lc_number on EACH use, so he recognizes
+ *       any cache set change.
+ */
+struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
+{
+	struct lc_element *e;
+
+	PARANOIA_ENTRY();
+	if (lc->flags & LC_STARVING) {
+		++lc->starving;
+		RETURN(NULL);
+	}
+
+	e = lc_find(lc, enr);
+	if (e) {
+		++lc->hits;
+		if (e->refcnt++ == 0)
+			lc->used++;
+		list_move(&e->list, &lc->in_use); /* Not evictable... */
+		RETURN(e);
+	}
+
+	++lc->misses;
+
+	/* In case there is nothing available and we can not kick out
+	 * the LRU element, we have to wait ...
+	 */
+	if (!lc_unused_element_available(lc)) {
+		__set_bit(__LC_STARVING, &lc->flags);
+		RETURN(NULL);
+	}
+
+	/* it was not present in the active set.
+	 * we are going to recycle an unused (or even "free") element.
+	 * user may need to commit a transaction to record that change.
+	 * we serialize on flags & TF_DIRTY */
+	if (test_and_set_bit(__LC_DIRTY, &lc->flags)) {
+		++lc->dirty;
+		RETURN(NULL);
+	}
+
+	e = lc_get_unused_element(lc);
+	BUG_ON(!e);
+
+	clear_bit(__LC_STARVING, &lc->flags);
+	BUG_ON(++e->refcnt != 1);
+	lc->used++;
+
+	lc->changing_element = e;
+	lc->new_number = enr;
+
+	RETURN(e);
+}
+
+/* similar to lc_get,
+ * but only gets a new reference on an existing element.
+ * you either get the requested element, or NULL.
+ * will be consolidated into one function.
+ */
+struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
+{
+	struct lc_element *e;
+
+	PARANOIA_ENTRY();
+	if (lc->flags & LC_STARVING) {
+		++lc->starving;
+		RETURN(NULL);
+	}
+
+	e = lc_find(lc, enr);
+	if (e) {
+		++lc->hits;
+		if (e->refcnt++ == 0)
+			lc->used++;
+		list_move(&e->list, &lc->in_use); /* Not evictable... */
+	}
+	RETURN(e);
+}
+
+/**
+ * lc_changed - tell @lc that the change has been recorded
+ * @lc: the lru cache to operate on
+ * @e: the element pending label change
+ */
+void lc_changed(struct lru_cache *lc, struct lc_element *e)
+{
+	PARANOIA_ENTRY();
+	BUG_ON(e != lc->changing_element);
+	PARANOIA_LC_ELEMENT(lc, e);
+	++lc->changed;
+	e->lc_number = lc->new_number;
+	list_add(&e->list, &lc->in_use);
+	hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number));
+	lc->changing_element = NULL;
+	lc->new_number = LC_FREE;
+	clear_bit(__LC_DIRTY, &lc->flags);
+	smp_mb__after_clear_bit();
+	RETURN();
+}
+
+
+/**
+ * lc_put - give up refcnt of @e
+ * @lc: the lru cache to operate on
+ * @e: the element to put
+ *
+ * If refcnt reaches zero, the element is moved to the lru list,
+ * and a %LC_STARVING (if set) is cleared.
+ * Returns the new (post-decrement) refcnt.
+ */
+unsigned int lc_put(struct lru_cache *lc, struct lc_element *e)
+{
+	PARANOIA_ENTRY();
+	PARANOIA_LC_ELEMENT(lc, e);
+	BUG_ON(e->refcnt == 0);
+	BUG_ON(e == lc->changing_element);
+	if (--e->refcnt == 0) {
+		/* move it to the front of LRU. */
+		list_move(&e->list, &lc->lru);
+		lc->used--;
+		clear_bit(__LC_STARVING, &lc->flags);
+		smp_mb__after_clear_bit();
+	}
+	RETURN(e->refcnt);
+}
+
+/**
+ * lc_element_by_index
+ * @lc: the lru cache to operate on
+ * @i: the index of the element to return
+ */
+struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i)
+{
+	BUG_ON(i >= lc->nr_elements);
+	BUG_ON(lc->lc_element[i] == NULL);
+	BUG_ON(lc->lc_element[i]->lc_index != i);
+	return lc->lc_element[i];
+}
+
+/**
+ * lc_index_of
+ * @lc: the lru cache to operate on
+ * @e: the element to query for its index position in lc->element
+ */
+unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e)
+{
+	PARANOIA_LC_ELEMENT(lc, e);
+	return e->lc_index;
+}
+
+/**
+ * lc_set - associate index with label
+ * @lc: the lru cache to operate on
+ * @enr: the label to set
+ * @index: the element index to associate label with.
+ *
+ * Used to initialize the active set to some previously recorded state.
+ */
+void lc_set(struct lru_cache *lc, unsigned int enr, int index)
+{
+	struct lc_element *e;
+
+	if (index < 0 || index >= lc->nr_elements)
+		return;
+
+	e = lc_element_by_index(lc, index);
+	e->lc_number = enr;
+
+	hlist_del_init(&e->colision);
+	hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
+	list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
+}
+
+/**
+ * lc_dump - Dump a complete LRU cache to seq in textual form.
+ * @lc: the lru cache to operate on
+ * @seq: the &struct seq_file pointer to seq_printf into
+ * @utext: user supplied "heading" or other info
+ * @detail: function pointer the user may provide to dump further details
+ * of the object the lc_element is embedded in.
+ */
+void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
+	     void (*detail) (struct seq_file *, struct lc_element *))
+{
+	unsigned int nr_elements = lc->nr_elements;
+	struct lc_element *e;
+	int i;
+
+	seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
+	for (i = 0; i < nr_elements; i++) {
+		e = lc_element_by_index(lc, i);
+		if (e->lc_number == LC_FREE) {
+			seq_printf(seq, "\t%2d: FREE\n", i);
+		} else {
+			seq_printf(seq, "\t%2d: %4u %4u    ", i,
+				   e->lc_number, e->refcnt);
+			detail(seq, e);
+		}
+	}
+}
+
+EXPORT_SYMBOL(lc_create);
+EXPORT_SYMBOL(lc_reset);
+EXPORT_SYMBOL(lc_destroy);
+EXPORT_SYMBOL(lc_set);
+EXPORT_SYMBOL(lc_del);
+EXPORT_SYMBOL(lc_try_get);
+EXPORT_SYMBOL(lc_find);
+EXPORT_SYMBOL(lc_get);
+EXPORT_SYMBOL(lc_put);
+EXPORT_SYMBOL(lc_changed);
+EXPORT_SYMBOL(lc_element_by_index);
+EXPORT_SYMBOL(lc_index_of);
+EXPORT_SYMBOL(lc_seq_printf_stats);
+EXPORT_SYMBOL(lc_seq_dump_details);
-- 
cgit v1.2.3


From 63312b6a6faae3f2e5577f2b001e3b504f10a2aa Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 2 Oct 2009 07:50:50 -0700
Subject: x86: Add a Kconfig option to turn the copy_from_user warnings into
 errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For automated testing it is useful to have the option to turn
the warnings on copy_from_user() etc checks into errors:

 In function ‘copy_from_user’,
     inlined from ‘fd_copyin’ at drivers/block/floppy.c:3080,
     inlined from ‘fd_ioctl’ at drivers/block/floppy.c:3503:
   linux/arch/x86/include/asm/uaccess_32.h:213:
  error: call to ‘copy_from_user_overflow’ declared with attribute error:
  copy_from_user buffer size is not provably correct

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20091002075050.4e9f7641@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug            | 14 ++++++++++++++
 arch/x86/include/asm/uaccess_32.h |  4 +++-
 include/linux/compiler-gcc4.h     |  1 +
 include/linux/compiler.h          |  3 +++
 4 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6bb..1bd2e36f1538 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -287,4 +287,18 @@ config OPTIMIZE_INLINING
 
 	  If unsure, say N.
 
+config DEBUG_STRICT_USER_COPY_CHECKS
+	bool "Strict copy size checks"
+	depends on DEBUG_KERNEL
+	---help---
+	  Enabling this option turns a certain set of sanity checks for user
+	  copy operations into compile time failures.
+
+	  The copy_from_user() etc checks are there to help test if there
+	  are sufficient security checks on the length argument of
+	  the copy operation, by having gcc prove that the argument is
+	  within bounds.
+
+	  If unsure, or if you run an older (pre 4.4) gcc, say N.
+
 endmenu
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 952f9e793c3e..0c9825e97f36 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -193,7 +193,9 @@ unsigned long __must_check _copy_from_user(void *to,
 
 
 extern void copy_from_user_overflow(void)
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
+#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
+	__compiletime_error("copy_from_user() buffer size is not provably correct")
+#else
 	__compiletime_warning("copy_from_user() buffer size is not provably correct")
 #endif
 ;
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index f1709c1f9eae..77542c57e20a 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -41,4 +41,5 @@
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
 #if __GNUC_MINOR__ >= 4
 #define __compiletime_warning(message) __attribute__((warning(message)))
+#define __compiletime_error(message) __attribute__((error(message)))
 #endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 950356311f12..88fd4b673cb4 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -273,6 +273,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 #ifndef __compiletime_warning
 # define __compiletime_warning(message)
 #endif
+#ifndef __compiletime_error
+# define __compiletime_error(message)
+#endif
 
 /*
  * Prevent the compiler from merging or refetching accesses.  The compiler
-- 
cgit v1.2.3


From a9828ec6bc0b7e19a65f7e13daa8bd35a926a753 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 1 Oct 2009 11:33:03 +0000
Subject: ethtool: Remove support for obsolete string query operations

The in-tree implementations have all been converted to
get_sset_count().

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  4 ----
 net/core/ethtool.c      | 58 +++++++++----------------------------------------
 2 files changed, 10 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 15e4eb713694..aa0dcb3833d1 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -495,10 +495,6 @@ struct ethtool_ops {
 	u32     (*get_priv_flags)(struct net_device *);
 	int     (*set_priv_flags)(struct net_device *, u32);
 	int	(*get_sset_count)(struct net_device *, int);
-
-	/* the following hooks are obsolete */
-	int	(*self_test_count)(struct net_device *);/* use get_sset_count */
-	int	(*get_stats_count)(struct net_device *);/* use get_sset_count */
 	int	(*get_rxnfc)(struct net_device *, struct ethtool_rxnfc *, void *);
 	int	(*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *);
 	int     (*flash_device)(struct net_device *, struct ethtool_flash *);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4c12ddb5f5ee..e1951084b973 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -198,13 +198,6 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
 		rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
 		if (rc >= 0)
 			info.n_priv_flags = rc;
-	} else {
-		/* code path for obsolete hooks */
-
-		if (ops->self_test_count)
-			info.testinfo_len = ops->self_test_count(dev);
-		if (ops->get_stats_count)
-			info.n_stats = ops->get_stats_count(dev);
 	}
 	if (ops->get_regs_len)
 		info.regdump_len = ops->get_regs_len(dev);
@@ -684,16 +677,10 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
 	u64 *data;
 	int ret, test_len;
 
-	if (!ops->self_test)
-		return -EOPNOTSUPP;
-	if (!ops->get_sset_count && !ops->self_test_count)
+	if (!ops->self_test || !ops->get_sset_count)
 		return -EOPNOTSUPP;
 
-	if (ops->get_sset_count)
-		test_len = ops->get_sset_count(dev, ETH_SS_TEST);
-	else
-		/* code path for obsolete hook */
-		test_len = ops->self_test_count(dev);
+	test_len = ops->get_sset_count(dev, ETH_SS_TEST);
 	if (test_len < 0)
 		return test_len;
 	WARN_ON(test_len == 0);
@@ -728,36 +715,17 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
 	u8 *data;
 	int ret;
 
-	if (!ops->get_strings)
+	if (!ops->get_strings || !ops->get_sset_count)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
 		return -EFAULT;
 
-	if (ops->get_sset_count) {
-		ret = ops->get_sset_count(dev, gstrings.string_set);
-		if (ret < 0)
-			return ret;
-
-		gstrings.len = ret;
-	} else {
-		/* code path for obsolete hooks */
-
-		switch (gstrings.string_set) {
-		case ETH_SS_TEST:
-			if (!ops->self_test_count)
-				return -EOPNOTSUPP;
-			gstrings.len = ops->self_test_count(dev);
-			break;
-		case ETH_SS_STATS:
-			if (!ops->get_stats_count)
-				return -EOPNOTSUPP;
-			gstrings.len = ops->get_stats_count(dev);
-			break;
-		default:
-			return -EINVAL;
-		}
-	}
+	ret = ops->get_sset_count(dev, gstrings.string_set);
+	if (ret < 0)
+		return ret;
+
+	gstrings.len = ret;
 
 	data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
 	if (!data)
@@ -798,16 +766,10 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
 	u64 *data;
 	int ret, n_stats;
 
-	if (!ops->get_ethtool_stats)
-		return -EOPNOTSUPP;
-	if (!ops->get_sset_count && !ops->get_stats_count)
+	if (!ops->get_ethtool_stats || !ops->get_sset_count)
 		return -EOPNOTSUPP;
 
-	if (ops->get_sset_count)
-		n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
-	else
-		/* code path for obsolete hook */
-		n_stats = ops->get_stats_count(dev);
+	n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
 	if (n_stats < 0)
 		return n_stats;
 	WARN_ON(n_stats == 0);
-- 
cgit v1.2.3


From 977750076d98c7ff6cbda51858bb5a5894a9d9ab Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 2 Oct 2009 06:56:41 +0000
Subject: af_packet: add interframe drop cmsg (v6)

Add Ancilliary data to better represent loss information

I've had a few requests recently to provide more detail regarding frame loss
during an AF_PACKET packet capture session.  Specifically the requestors want to
see where in a packet sequence frames were lost, i.e. they want to see that 40
frames were lost between frames 302 and 303 in a packet capture file.  In order
to do this we need:

1) The kernel to export this data to user space
2) The applications to make use of it

This patch addresses item (1).  It does this by doing the following:

A) Anytime we drop a frame for which we would increment po->stats.tp_drops, we
also no increment a stats called po->stats.tp_gap.

B) Every time we successfully enqueue a frame to sk_receive_queue, we record the
value of po->stats.tp_gap in skb->mark.  skb->cb would nominally be the place to
record this, but since all the space there is used up, we're overloading
skb->mark.  Its safe to do since any enqueued packet is guaranteed to be
unshared at this point, and skb->mark isn't used for anything else in the rx
path to the application.  After we record tp_gap in the skb, we zero
po->stats.tp_gap.  This allows us to keep a counter of the number of frames lost
between any two enqueued packets

C) When the application goes to dequeue a frame from the packet socket, we look
at skb->mark for that frame.  If it is non-zero, we add a cmsg chunk to the
msghdr of level SOL_PACKET and type PACKET_GAPDATA.  Its a 32 bit integer that
represents the number of frames lost between this packet and the last previous
frame received.

Note there is a chance that if there is frame loss after a receive, and then the
socket is closed, some gap data might be lost.  This is covered by the use of
the PACKET_AUXDATA socket option, which gives total loss data.  With a bit of
math, the final gap can be determined that way.

I've tested this patch myself, and it works well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

 include/linux/if_packet.h |    2 ++
 net/packet/af_packet.c    |   33 +++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_packet.h |  2 ++
 net/packet/af_packet.c    | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index dea7d6b7cf98..e5d200f53fc3 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -48,11 +48,13 @@ struct sockaddr_ll
 #define PACKET_RESERVE			12
 #define PACKET_TX_RING			13
 #define PACKET_LOSS			14
+#define PACKET_GAPDATA			15
 
 struct tpacket_stats
 {
 	unsigned int	tp_packets;
 	unsigned int	tp_drops;
+	unsigned int    tp_gap;
 };
 
 struct tpacket_auxdata
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d7ecca0a0c07..d398a9bf6903 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -523,6 +523,31 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
 	return res;
 }
 
+/*
+ * If we've lost frames since the last time we queued one to the
+ * sk_receive_queue, we need to record it here.
+ * This must be called under the protection of the socket lock
+ * to prevent racing with other softirqs and user space
+ */
+static inline void record_packet_gap(struct sk_buff *skb,
+					struct packet_sock *po)
+{
+	/*
+	 * We overload the mark field here, since we're about
+	 * to enqueue to a receive queue and no body else will
+	 * use this field at this point
+	 */
+	skb->mark = po->stats.tp_gap;
+	po->stats.tp_gap = 0;
+	return;
+
+}
+
+static inline __u32 check_packet_gap(struct sk_buff *skb)
+{
+	return skb->mark;
+}
+
 /*
    This function makes lazy skb cloning in hope that most of packets
    are discarded by BPF.
@@ -626,6 +651,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	spin_lock(&sk->sk_receive_queue.lock);
 	po->stats.tp_packets++;
+	record_packet_gap(skb, po);
 	__skb_queue_tail(&sk->sk_receive_queue, skb);
 	spin_unlock(&sk->sk_receive_queue.lock);
 	sk->sk_data_ready(sk, skb->len);
@@ -634,6 +660,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 drop_n_acct:
 	spin_lock(&sk->sk_receive_queue.lock);
 	po->stats.tp_drops++;
+	po->stats.tp_gap++;
 	spin_unlock(&sk->sk_receive_queue.lock);
 
 drop_n_restore:
@@ -811,6 +838,7 @@ drop:
 
 ring_is_full:
 	po->stats.tp_drops++;
+	po->stats.tp_gap++;
 	spin_unlock(&sk->sk_receive_queue.lock);
 
 	sk->sk_data_ready(sk, 0);
@@ -1418,6 +1446,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 	struct sk_buff *skb;
 	int copied, err;
 	struct sockaddr_ll *sll;
+	__u32 gap;
 
 	err = -EINVAL;
 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
@@ -1496,6 +1525,10 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
 	}
 
+	gap = check_packet_gap(skb);
+	if (gap)
+		put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(__u32), &gap);
+
 	/*
 	 *	Free or return the buffer as appropriate. Again this
 	 *	hides all the races and re-entrancy issues from us.
-- 
cgit v1.2.3


From e1e499eef2200c2a7120c9ebf297d48b195cf887 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Fri, 2 Oct 2009 05:15:25 +0000
Subject: usbnet: Use wwan%d interface name for mobile broadband devices

Add support for usbnet based devices like CDC-Ether to indicate that they
are actually mobile broadband devices. In that case use wwan%d as default
interface name.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_ether.c | 20 ++++++++++++++------
 drivers/net/usb/usbnet.c    |  3 +++
 include/linux/usb/usbnet.h  |  1 +
 3 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 4a6aff579403..71e65fc10e6f 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -420,6 +420,14 @@ static const struct driver_info	cdc_info = {
 	.status =	cdc_status,
 };
 
+static const struct driver_info mbm_info = {
+	.description =	"Mobile Broadband Network Device",
+	.flags =	FLAG_WWAN,
+	.bind = 	cdc_bind,
+	.unbind =	usbnet_cdc_unbind,
+	.status =	cdc_status,
+};
+
 /*-------------------------------------------------------------------------*/
 
 
@@ -532,32 +540,32 @@ static const struct usb_device_id	products [] = {
 	/* Ericsson F3507g */
 	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1900, USB_CLASS_COMM,
 			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &cdc_info,
+	.driver_info = (unsigned long) &mbm_info,
 }, {
 	/* Ericsson F3507g ver. 2 */
 	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1902, USB_CLASS_COMM,
 			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &cdc_info,
+	.driver_info = (unsigned long) &mbm_info,
 }, {
 	/* Ericsson F3607gw */
 	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1904, USB_CLASS_COMM,
 			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &cdc_info,
+	.driver_info = (unsigned long) &mbm_info,
 }, {
 	/* Ericsson F3307 */
 	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1906, USB_CLASS_COMM,
 			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &cdc_info,
+	.driver_info = (unsigned long) &mbm_info,
 }, {
 	/* Toshiba F3507g */
 	USB_DEVICE_AND_INTERFACE_INFO(0x0930, 0x130b, USB_CLASS_COMM,
 			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &cdc_info,
+	.driver_info = (unsigned long) &mbm_info,
 }, {
 	/* Dell F3507g */
 	USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x8147, USB_CLASS_COMM,
 			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &cdc_info,
+	.driver_info = (unsigned long) &mbm_info,
 },
 	{ },		// END
 };
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index ca5ca5ae061d..8124cf16259f 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1295,6 +1295,9 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 		/* WLAN devices should always be named "wlan%d" */
 		if ((dev->driver_info->flags & FLAG_WLAN) != 0)
 			strcpy(net->name, "wlan%d");
+		/* WWAN devices should always be named "wwan%d" */
+		if ((dev->driver_info->flags & FLAG_WWAN) != 0)
+			strcpy(net->name, "wwan%d");
 
 		/* maybe the remote can't receive an Ethernet MTU */
 		if (net->mtu > (dev->hard_mtu - net->hard_header_len))
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index f81473052059..86c31b753266 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -90,6 +90,7 @@ struct driver_info {
 #define FLAG_WLAN	0x0080		/* use "wlan%d" names */
 #define FLAG_AVOID_UNLINK_URBS 0x0100	/* don't unlink urbs at usbnet_stop() */
 #define FLAG_SEND_ZLP	0x0200		/* hw requires ZLPs are sent */
+#define FLAG_WWAN	0x0400		/* use "wwan%d" names */
 
 
 	/* init device ... can sleep, or cause probe() failure */
-- 
cgit v1.2.3


From 7ffbe3fdace0bdfcdab8dc6c77506feda0871f79 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 2 Oct 2009 05:15:27 +0000
Subject: net: introduce NETDEV_POST_INIT notifier

For various purposes including a wireless extensions
bugfix, we need to hook into the netdev creation before
before netdev_register_kobject(). This will also ease
doing the dev type assignment that Marcel was working
on for cfg80211 drivers w/o touching them all.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/notifier.h | 1 +
 net/core/dev.c           | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 44428d247dbe..29714b8441b1 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -201,6 +201,7 @@ static inline int notifier_to_errno(int ret)
 #define NETDEV_PRE_UP		0x000D
 #define NETDEV_BONDING_OLDTYPE  0x000E
 #define NETDEV_BONDING_NEWTYPE  0x000F
+#define NETDEV_POST_INIT	0x0010
 
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN
diff --git a/net/core/dev.c b/net/core/dev.c
index b8f74cfb1bfd..a74c8fd69556 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4836,6 +4836,12 @@ int register_netdevice(struct net_device *dev)
 		dev->features |= NETIF_F_GSO;
 
 	netdev_initialize_kobject(dev);
+
+	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
+	ret = notifier_to_errno(ret);
+	if (ret)
+		goto err_uninit;
+
 	ret = netdev_register_kobject(dev);
 	if (ret)
 		goto err_uninit;
-- 
cgit v1.2.3


From 9f5180e5c331d7b3ccc35e1a78072235d38f9f34 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 6 Oct 2009 09:30:14 +0200
Subject: drbd: Work on permission enforcement

Now we have the capabilities of the sending process available,
use them to enforce CAP_SYS_ADMIN.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/drbd/drbd_nl.c | 7 ++++++-
 include/linux/drbd.h         | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 73c55ccb629a..22538d9628f1 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -2000,7 +2000,7 @@ static struct cn_handler_struct cnd_table[] = {
 	[ P_new_c_uuid ]	= { &drbd_nl_new_c_uuid,	0 },
 };
 
-static void drbd_connector_callback(struct cn_msg *req)
+static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
 {
 	struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
 	struct cn_handler_struct *cm;
@@ -2017,6 +2017,11 @@ static void drbd_connector_callback(struct cn_msg *req)
 		return;
 	}
 
+	if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) {
+		retcode = ERR_PERM;
+		goto fail;
+	}
+
 	mdev = ensure_mdev(nlp);
 	if (!mdev) {
 		retcode = ERR_MINOR_INVALID;
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 69dc711f37b3..233db5c18b86 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -138,6 +138,7 @@ enum drbd_ret_codes {
 	ERR_VERIFY_RUNNING	= 149, /* DRBD 8.2 only */
 	ERR_DATA_NOT_CURRENT	= 150,
 	ERR_CONNECTED		= 151, /* DRBD 8.3 only */
+	ERR_PERM		= 152,
 
 	/* insert new ones above this line */
 	AFTER_LAST_ERR_CODE
-- 
cgit v1.2.3


From ee5e81f00051b5c373c8de16e3604fd6d3be699e Mon Sep 17 00:00:00 2001
From: Ilia K <mail4ilia@gmail.com>
Date: Wed, 16 Sep 2009 05:53:07 +0000
Subject: add vif using local interface index instead of IP

When routing daemon wants to enable forwarding of multicast traffic it
performs something like:

       struct vifctl vc = {
               .vifc_vifi  = 1,
               .vifc_flags = 0,
               .vifc_threshold = 1,
               .vifc_rate_limit = 0,
               .vifc_lcl_addr = ip, /* <--- ip address of physical
interface, e.g. eth0 */
               .vifc_rmt_addr.s_addr = htonl(INADDR_ANY),
         };
       setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));

This leads (in the kernel) to calling  vif_add() function call which
search the (physical) device using assigned IP address:
       dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);

The current API (struct vifctl) does not allow to specify an
interface other way than using it's IP, and if there are more than a
single interface with specified IP only the first one will be found.

The attached patch (against 2.6.30.4) allows to specify an interface
by its index, instead of IP address:

       struct vifctl vc = {
               .vifc_vifi  = 1,
               .vifc_flags = VIFF_USE_IFINDEX,   /* NEW */
               .vifc_threshold = 1,
               .vifc_rate_limit = 0,
               .vifc_lcl_ifindex = if_nametoindex("eth0"),   /* NEW */
               .vifc_rmt_addr.s_addr = htonl(INADDR_ANY),
         };
       setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));

Signed-off-by: Ilia K. <mail4ilia@gmail.com>

=== modified file 'include/linux/mroute.h'
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h | 13 +++++++++----
 net/ipv4/ipmr.c        | 12 +++++++++++-
 2 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 08bc776d05e2..d5f69151f692 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -59,13 +59,18 @@ struct vifctl {
 	unsigned char vifc_flags;	/* VIFF_ flags */
 	unsigned char vifc_threshold;	/* ttl limit */
 	unsigned int vifc_rate_limit;	/* Rate limiter values (NI) */
-	struct in_addr vifc_lcl_addr;	/* Our address */
+	union {
+		struct in_addr vifc_lcl_addr;     /* Local interface address */
+		int            vifc_lcl_ifindex;  /* Local interface index   */
+	};
 	struct in_addr vifc_rmt_addr;	/* IPIP tunnel addr */
 };
 
-#define VIFF_TUNNEL	0x1	/* IPIP tunnel */
-#define VIFF_SRCRT	0x2	/* NI */
-#define VIFF_REGISTER	0x4	/* register vif	*/
+#define VIFF_TUNNEL		0x1	/* IPIP tunnel */
+#define VIFF_SRCRT		0x2	/* NI */
+#define VIFF_REGISTER		0x4	/* register vif	*/
+#define VIFF_USE_IFINDEX	0x8	/* use vifc_lcl_ifindex instead of
+					   vifc_lcl_addr to find an interface */
 
 /*
  *	Cache manipulation structures for mrouted and PIMd
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 630a56df7b47..c757f0b4b74c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -469,8 +469,18 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
 			return err;
 		}
 		break;
+
+	case VIFF_USE_IFINDEX:
 	case 0:
-		dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
+			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
+			if (dev && dev->ip_ptr == NULL) {
+				dev_put(dev);
+				return -EADDRNOTAVAIL;
+			}
+		} else
+			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+
 		if (!dev)
 			return -EADDRNOTAVAIL;
 		err = dev_set_allmulti(dev, 1);
-- 
cgit v1.2.3


From fa857afcf77da669eb6b7031ec07ad14b912c307 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki / 吉藤英明 <yoshfuji@linux-ipv6.org>
Date: Tue, 22 Sep 2009 23:43:14 +0000
Subject: ipv6 sit: 6rd (IPv6 Rapid Deployment) Support.

IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon
mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly
deploy IPv6 unicast service to IPv4 sites to which it provides
customer premise equipment.  Like 6to4, it utilizes stateless IPv6 in
IPv4 encapsulation in order to transit IPv4-only network
infrastructure.  Unlike 6to4, a 6rd service provider uses an IPv6
prefix of its own in place of the fixed 6to4 prefix.

With this option enabled, the SIT driver offers 6rd functionality by
providing additional ioctl API to configure the IPv6 Prefix for in
stead of static 2002::/16 for 6to4.

Original patch was done by Alexandre Cassen <acassen@freebox.fr>
based on old Internet-Draft.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_tunnel.h |  11 ++++
 include/net/ipip.h        |  13 +++++
 net/ipv6/Kconfig          |  19 +++++++
 net/ipv6/sit.c            | 124 +++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 159 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h
index 5a9aae4adb44..c53c8e016940 100644
--- a/include/linux/if_tunnel.h
+++ b/include/linux/if_tunnel.h
@@ -15,6 +15,10 @@
 #define SIOCADDPRL      (SIOCDEVPRIVATE + 5)
 #define SIOCDELPRL      (SIOCDEVPRIVATE + 6)
 #define SIOCCHGPRL      (SIOCDEVPRIVATE + 7)
+#define SIOCGET6RD      (SIOCDEVPRIVATE + 8)
+#define SIOCADD6RD      (SIOCDEVPRIVATE + 9)
+#define SIOCDEL6RD      (SIOCDEVPRIVATE + 10)
+#define SIOCCHG6RD      (SIOCDEVPRIVATE + 11)
 
 #define GRE_CSUM	__cpu_to_be16(0x8000)
 #define GRE_ROUTING	__cpu_to_be16(0x4000)
@@ -51,6 +55,13 @@ struct ip_tunnel_prl {
 /* PRL flags */
 #define	PRL_DEFAULT		0x0001
 
+struct ip_tunnel_6rd {
+	struct in6_addr		prefix;
+	__be32			relay_prefix;
+	__u16			prefixlen;
+	__u16			relay_prefixlen;
+};
+
 enum
 {
 	IFLA_GRE_UNSPEC,
diff --git a/include/net/ipip.h b/include/net/ipip.h
index 0159221a8509..86f1c8bd040c 100644
--- a/include/net/ipip.h
+++ b/include/net/ipip.h
@@ -7,6 +7,15 @@
 /* Keep error state on tunnel for 30 sec */
 #define IPTUNNEL_ERR_TIMEO	(30*HZ)
 
+/* 6rd prefix/relay information */
+struct ip_tunnel_6rd_parm
+{
+	struct in6_addr		prefix;
+	__be32			relay_prefix;
+	u16			prefixlen;
+	u16			relay_prefixlen;
+};
+
 struct ip_tunnel
 {
 	struct ip_tunnel	*next;
@@ -23,6 +32,10 @@ struct ip_tunnel
 
 	struct ip_tunnel_parm	parms;
 
+	/* for SIT */
+#ifdef CONFIG_IPV6_SIT_6RD
+	struct ip_tunnel_6rd_parm	ip6rd;
+#endif
 	struct ip_tunnel_prl_entry	*prl;		/* potential router list */
 	unsigned int			prl_count;	/* # of entries in PRL */
 };
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ead6c7a42f44..f56199827452 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -170,6 +170,25 @@ config IPV6_SIT
 
 	  Saying M here will produce a module called sit. If unsure, say Y.
 
+config IPV6_SIT_6RD
+	bool "IPv6: IPv6 Rapid Development (6RD) (EXPERIMENTAL)"
+	depends on IPV6_SIT && EXPERIMENTAL
+	default n
+	---help---
+	  IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon
+	  mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly
+	  deploy IPv6 unicast service to IPv4 sites to which it provides
+	  customer premise equipment.  Like 6to4, it utilizes stateless IPv6 in
+	  IPv4 encapsulation in order to transit IPv4-only network
+	  infrastructure.  Unlike 6to4, a 6rd service provider uses an IPv6
+	  prefix of its own in place of the fixed 6to4 prefix.
+
+	  With this option enabled, the SIT driver offers 6rd functionality by
+	  providing additional ioctl API to configure the IPv6 Prefix for in
+	  stead of static 2002::/16 for 6to4.
+
+	  If unsure, say N.
+
 config IPV6_NDISC_NODETYPE
 	bool
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 99da272951dc..6955654262a5 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -161,6 +161,21 @@ static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t)
 	write_unlock_bh(&ipip6_lock);
 }
 
+static void ipip6_tunnel_clone_6rd(struct ip_tunnel *t, struct sit_net *sitn)
+{
+#ifdef CONFIG_IPV6_SIT_6RD
+	if (t->dev == sitn->fb_tunnel_dev) {
+		ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
+		t->ip6rd.relay_prefix = 0;
+		t->ip6rd.prefixlen = 16;
+		t->ip6rd.relay_prefixlen = 0;
+	} else {
+		struct ip_tunnel *t0 = netdev_priv(sitn->fb_tunnel_dev);
+		memcpy(&t->ip6rd, &t0->ip6rd, sizeof(t->ip6rd));
+	}
+#endif
+}
+
 static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
 		struct ip_tunnel_parm *parms, int create)
 {
@@ -213,6 +228,8 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
 
 	dev_hold(dev);
 
+	ipip6_tunnel_clone_6rd(t, sitn);
+
 	ipip6_tunnel_link(sitn, nt);
 	return nt;
 
@@ -532,17 +549,41 @@ out:
 	return 0;
 }
 
-/* Returns the embedded IPv4 address if the IPv6 address
-   comes from 6to4 (RFC 3056) addr space */
-
-static inline __be32 try_6to4(struct in6_addr *v6dst)
+/*
+ * Returns the embedded IPv4 address if the IPv6 address
+ * comes from 6rd / 6to4 (RFC 3056) addr space.
+ */
+static inline
+__be32 try_6rd(struct in6_addr *v6dst, struct ip_tunnel *tunnel)
 {
 	__be32 dst = 0;
 
+#ifdef CONFIG_IPV6_SIT_6RD
+	if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix,
+			      tunnel->ip6rd.prefixlen)) {
+		unsigned pbw0, pbi0;
+		int pbi1;
+		u32 d;
+
+		pbw0 = tunnel->ip6rd.prefixlen >> 5;
+		pbi0 = tunnel->ip6rd.prefixlen & 0x1f;
+
+		d = (ntohl(tunnel->ip6rd.prefix.s6_addr32[pbw0]) << pbi0) >>
+		    tunnel->ip6rd.relay_prefixlen;
+
+		pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen;
+		if (pbi1 > 0)
+			d |= ntohl(tunnel->ip6rd.prefix.s6_addr32[pbw0 + 1]) >>
+			     (32 - pbi1);
+
+		dst = tunnel->ip6rd.relay_prefix | htonl(d);
+	}
+#else
 	if (v6dst->s6_addr16[0] == htons(0x2002)) {
 		/* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */
 		memcpy(&dst, &v6dst->s6_addr16[1], 4);
 	}
+#endif
 	return dst;
 }
 
@@ -596,7 +637,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 	}
 
 	if (!dst)
-		dst = try_6to4(&iph6->daddr);
+		dst = try_6rd(&iph6->daddr, tunnel);
 
 	if (!dst) {
 		struct neighbour *neigh = NULL;
@@ -786,9 +827,15 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 	struct ip_tunnel *t;
 	struct net *net = dev_net(dev);
 	struct sit_net *sitn = net_generic(net, sit_net_id);
+#ifdef CONFIG_IPV6_SIT_6RD
+	struct ip_tunnel_6rd ip6rd;
+#endif
 
 	switch (cmd) {
 	case SIOCGETTUNNEL:
+#ifdef CONFIG_IPV6_SIT_6RD
+	case SIOCGET6RD:
+#endif
 		t = NULL;
 		if (dev == sitn->fb_tunnel_dev) {
 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
@@ -799,9 +846,25 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 		}
 		if (t == NULL)
 			t = netdev_priv(dev);
-		memcpy(&p, &t->parms, sizeof(p));
-		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
-			err = -EFAULT;
+
+		err = -EFAULT;
+		if (cmd == SIOCGETTUNNEL) {
+			memcpy(&p, &t->parms, sizeof(p));
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &p,
+					 sizeof(p)))
+				goto done;
+#ifdef CONFIG_IPV6_SIT_6RD
+		} else {
+			ipv6_addr_copy(&ip6rd.prefix, &t->ip6rd.prefix);
+			ip6rd.relay_prefix = t->ip6rd.relay_prefix;
+			ip6rd.prefixlen = t->ip6rd.prefixlen;
+			ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd,
+					 sizeof(ip6rd)))
+				goto done;
+#endif
+		}
+		err = 0;
 		break;
 
 	case SIOCADDTUNNEL:
@@ -922,6 +985,51 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 		netdev_state_change(dev);
 		break;
 
+#ifdef CONFIG_IPV6_SIT_6RD
+	case SIOCADD6RD:
+	case SIOCCHG6RD:
+	case SIOCDEL6RD:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		err = -EFAULT;
+		if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data,
+				   sizeof(ip6rd)))
+			goto done;
+
+		t = netdev_priv(dev);
+
+		if (cmd != SIOCDEL6RD) {
+			struct in6_addr prefix;
+			__be32 relay_prefix;
+
+			err = -EINVAL;
+			if (ip6rd.relay_prefixlen > 32 ||
+			    ip6rd.prefixlen + (32 - ip6rd.relay_prefixlen) > 64)
+				goto done;
+
+			ipv6_addr_prefix(&prefix, &ip6rd.prefix,
+					 ip6rd.prefixlen);
+			if (!ipv6_addr_equal(&prefix, &ip6rd.prefix))
+				goto done;
+			relay_prefix = ip6rd.relay_prefix &
+				       htonl(0xffffffffUL <<
+					     (32 - ip6rd.relay_prefixlen));
+			if (relay_prefix != ip6rd.relay_prefix)
+				goto done;
+
+			ipv6_addr_copy(&t->ip6rd.prefix, &prefix);
+			t->ip6rd.relay_prefix = relay_prefix;
+			t->ip6rd.prefixlen = ip6rd.prefixlen;
+			t->ip6rd.relay_prefixlen = ip6rd.relay_prefixlen;
+		} else
+			ipip6_tunnel_clone_6rd(t, sitn);
+
+		err = 0;
+		break;
+#endif
+
 	default:
 		err = -EINVAL;
 	}
-- 
cgit v1.2.3


From d73d3a8cb4723e161589864741d8528d70b350eb Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Mon, 5 Oct 2009 10:59:58 +0000
Subject: ethtool: Add reset operation

After updating firmware stored in flash, users may wish to reset the
relevant hardware and start the new firmware immediately.  This should
not be completely automatic as it may be disruptive.

A selective reset may also be useful for debugging or diagnostics.

This adds a separate reset operation which takes flags indicating the
components to be reset.  Drivers are allowed to reset only a subset of
those requested, and must indicate the actual subset.  This allows the
use of generic component masks and some future expansion.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 32 ++++++++++++++++++++++++++++++++
 net/core/ethtool.c      | 23 +++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index aa0dcb3833d1..eb1a48da2d43 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -498,6 +498,7 @@ struct ethtool_ops {
 	int	(*get_rxnfc)(struct net_device *, struct ethtool_rxnfc *, void *);
 	int	(*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *);
 	int     (*flash_device)(struct net_device *, struct ethtool_flash *);
+	int	(*reset)(struct net_device *, u32 *);
 };
 #endif /* __KERNEL__ */
 
@@ -555,6 +556,7 @@ struct ethtool_ops {
 #define	ETHTOOL_SRXCLSRLDEL	0x00000031 /* Delete RX classification rule */
 #define	ETHTOOL_SRXCLSRLINS	0x00000032 /* Insert RX classification rule */
 #define	ETHTOOL_FLASHDEV	0x00000033 /* Flash firmware to device */
+#define	ETHTOOL_RESET		0x00000034 /* Reset hardware */
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
@@ -685,4 +687,34 @@ struct ethtool_ops {
 
 #define	RX_CLS_FLOW_DISC	0xffffffffffffffffULL
 
+/* Reset flags */
+/* The reset() operation must clear the flags for the components which
+ * were actually reset.  On successful return, the flags indicate the
+ * components which were not reset, either because they do not exist
+ * in the hardware or because they cannot be reset independently.  The
+ * driver must never reset any components that were not requested.
+ */
+enum ethtool_reset_flags {
+	/* These flags represent components dedicated to the interface
+	 * the command is addressed to.  Shift any flag left by
+	 * ETH_RESET_SHARED_SHIFT to reset a shared component of the
+	 * same type.
+	 */
+	ETH_RESET_MGMT		= 1 << 0,	/* Management processor */
+	ETH_RESET_IRQ		= 1 << 1,	/* Interrupt requester */
+	ETH_RESET_DMA		= 1 << 2,	/* DMA engine */
+	ETH_RESET_FILTER	= 1 << 3,	/* Filtering/flow direction */
+	ETH_RESET_OFFLOAD	= 1 << 4,	/* Protocol offload */
+	ETH_RESET_MAC		= 1 << 5,	/* Media access controller */
+	ETH_RESET_PHY		= 1 << 6,	/* Transceiver/PHY */
+	ETH_RESET_RAM		= 1 << 7,	/* RAM shared between
+						 * multiple components */
+
+	ETH_RESET_DEDICATED	= 0x0000ffff,	/* All components dedicated to
+						 * this interface */
+	ETH_RESET_ALL		= 0xffffffff,	/* All components used by this
+						 * interface, even if shared */
+};
+#define ETH_RESET_SHARED_SHIFT	16
+
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e1951084b973..d8aee584e8d1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -302,6 +302,26 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 	return ret;
 }
 
+static int ethtool_reset(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value reset;
+	int ret;
+
+	if (!dev->ethtool_ops->reset)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&reset, useraddr, sizeof(reset)))
+		return -EFAULT;
+
+	ret = dev->ethtool_ops->reset(dev, &reset.data);
+	if (ret)
+		return ret;
+
+	if (copy_to_user(useraddr, &reset, sizeof(reset)))
+		return -EFAULT;
+	return 0;
+}
+
 static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_wolinfo wol = { ETHTOOL_GWOL };
@@ -1089,6 +1109,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_FLASHDEV:
 		rc = ethtool_flash_device(dev, useraddr);
 		break;
+	case ETHTOOL_RESET:
+		rc = ethtool_reset(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From f7734fdf61ec6bb848e0bafc1fb8bad2c124bb50 Mon Sep 17 00:00:00 2001
From: Octavian Purdila <opurdila@ixiacom.com>
Date: Fri, 2 Oct 2009 11:39:15 +0000
Subject: make TLLAO option for NA packets configurable

On Friday 02 October 2009 20:53:51 you wrote:

> This is good although I would have shortened the name.

Ah, I knew I forgot something :) Here is v4.

tavi

>From 24d96d825b9fa832b22878cc6c990d5711968734 Mon Sep 17 00:00:00 2001
From: Octavian Purdila <opurdila@ixiacom.com>
Date: Fri, 2 Oct 2009 00:51:15 +0300
Subject: [PATCH] ipv6: new sysctl for sending TLLAO with unicast NAs

Neighbor advertisements responding to unicast neighbor solicitations
did not include the target link-layer address option. This patch adds
a new sysctl option (disabled by default) which controls whether this
option should be sent even with unicast NAs.

The need for this arose because certain routers expect the TLLAO in
some situations even as a response to unicast NS packets.

Moreover, RFC 2461 recommends sending this to avoid a race condition
(section 4.4, Target link-layer address)

Signed-off-by: Cosmin Ratiu <cratiu@ixiacom.com>
Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++++++
 include/linux/ipv6.h                   |  1 +
 net/ipv6/addrconf.c                    |  8 ++++++++
 net/ipv6/ndisc.c                       |  1 +
 4 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index fbe427a6580c..a0e134dd2523 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1086,6 +1086,24 @@ accept_dad - INTEGER
 	2: Enable DAD, and disable IPv6 operation if MAC-based duplicate
 	   link-local address has been found.
 
+force_tllao - BOOLEAN
+	Enable sending the target link-layer address option even when
+	responding to a unicast neighbor solicitation.
+	Default: FALSE
+
+	Quoting from RFC 2461, section 4.4, Target link-layer address:
+
+	"The option MUST be included for multicast solicitations in order to
+	avoid infinite Neighbor Solicitation "recursion" when the peer node
+	does not have a cache entry to return a Neighbor Advertisements
+	message.  When responding to unicast solicitations, the option can be
+	omitted since the sender of the solicitation has the correct link-
+	layer address; otherwise it would not have be able to send the unicast
+	solicitation in the first place. However, including the link-layer
+	address in this case adds little overhead and eliminates a potential
+	race condition where the sender deletes the cached link-layer address
+	prior to receiving a response to a previous solicitation."
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index c662efa68289..ae74ede1abe7 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -167,6 +167,7 @@ struct ipv6_devconf {
 #endif
 	__s32		disable_ipv6;
 	__s32		accept_dad;
+	__s32		force_tllao;
 	void		*sysctl;
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1fd0a3d775d2..bdcee6981c60 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4352,6 +4352,14 @@ static struct addrconf_sysctl_table
 			.mode		=	0644,
 			.proc_handler	=	proc_dointvec,
 		},
+		{
+			.ctl_name       = CTL_UNNUMBERED,
+			.procname       = "force_tllao",
+			.data           = &ipv6_devconf.force_tllao,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec
+		},
 		{
 			.ctl_name	=	0,	/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index f74e4e2cdd06..3507cfe1e7a2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -598,6 +598,7 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
 	icmp6h.icmp6_solicited = solicited;
 	icmp6h.icmp6_override = override;
 
+	inc_opt |= ifp->idev->cnf.force_tllao;
 	__ndisc_send(dev, neigh, daddr, src_addr,
 		     &icmp6h, solicited_addr,
 		     inc_opt ? ND_OPT_TARGET_LL_ADDR : 0);
-- 
cgit v1.2.3


From 32953543221cfe2bf0a24205fab225e5b8ed81a0 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 5 Oct 2009 06:01:03 +0000
Subject: dcb: data center bridging ops should be r/o

The data center bridging ops structure can be const

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ixgbe/ixgbe.h        | 2 +-
 drivers/net/ixgbe/ixgbe_dcb_nl.c | 2 +-
 include/linux/netdevice.h        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 385be6016667..28f32da794dd 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -397,7 +397,7 @@ enum ixgbe_boards {
 extern struct ixgbe_info ixgbe_82598_info;
 extern struct ixgbe_info ixgbe_82599_info;
 #ifdef CONFIG_IXGBE_DCB
-extern struct dcbnl_rtnl_ops dcbnl_ops;
+extern const struct dcbnl_rtnl_ops dcbnl_ops;
 extern int ixgbe_copy_dcb_cfg(struct ixgbe_dcb_config *src_dcb_cfg,
                               struct ixgbe_dcb_config *dst_dcb_cfg,
                               int tc_max);
diff --git a/drivers/net/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ixgbe/ixgbe_dcb_nl.c
index a6bc1ef28f92..3c7a79a7d7c6 100644
--- a/drivers/net/ixgbe/ixgbe_dcb_nl.c
+++ b/drivers/net/ixgbe/ixgbe_dcb_nl.c
@@ -563,7 +563,7 @@ static u8 ixgbe_dcbnl_setapp(struct net_device *netdev,
 	return rval;
 }
 
-struct dcbnl_rtnl_ops dcbnl_ops = {
+const struct dcbnl_rtnl_ops dcbnl_ops = {
 	.getstate	= ixgbe_dcbnl_get_state,
 	.setstate	= ixgbe_dcbnl_set_state,
 	.getpermhwaddr	= ixgbe_dcbnl_get_perm_hw_addr,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94958c109761..b332eefebb1b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -909,7 +909,7 @@ struct net_device
 
 #ifdef CONFIG_DCB
 	/* Data Center Bridging netlink ops */
-	struct dcbnl_rtnl_ops *dcbnl_ops;
+	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
 
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
-- 
cgit v1.2.3


From 7c89606e24cdabaceb8ca9b3c7ab866c6bcc9e38 Mon Sep 17 00:00:00 2001
From: Holger Schurig <hs4233@mail.mn-solutions.de>
Date: Thu, 24 Sep 2009 12:21:01 +0200
Subject: nl80211: report age of scan results

Linux keeps scan results up to 15 seconds. This can be a problem for fast
moving clients: they get back stale data. But if the kernel reports the age
of the BSS items, then user-space can simply weed out old entries by itself.

Signed-off-by: Holger Schurig <hs4233@mail.mn-solutions.de>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h | 2 ++
 net/wireless/nl80211.c  | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index a8d71ed43a0e..50afca3dcff1 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -1277,6 +1277,7 @@ enum nl80211_channel_type {
  * @NL80211_BSS_SIGNAL_UNSPEC: signal strength of the probe response/beacon
  *	in unspecified units, scaled to 0..100 (u8)
  * @NL80211_BSS_STATUS: status, if this BSS is "used"
+ * @NL80211_BSS_SEEN_MS_AGO: age of this BSS entry in ms
  * @__NL80211_BSS_AFTER_LAST: internal
  * @NL80211_BSS_MAX: highest BSS attribute
  */
@@ -1291,6 +1292,7 @@ enum nl80211_bss {
 	NL80211_BSS_SIGNAL_MBM,
 	NL80211_BSS_SIGNAL_UNSPEC,
 	NL80211_BSS_STATUS,
+	NL80211_BSS_SEEN_MS_AGO,
 
 	/* keep last */
 	__NL80211_BSS_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index eddab097435c..e0ecc9f153d4 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3105,6 +3105,8 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 		NLA_PUT_U16(msg, NL80211_BSS_BEACON_INTERVAL, res->beacon_interval);
 	NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability);
 	NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq);
+	NLA_PUT_U32(msg, NL80211_BSS_SEEN_MS_AGO,
+		jiffies_to_msecs(jiffies - intbss->ts));
 
 	switch (rdev->wiphy.signal_type) {
 	case CFG80211_SIGNAL_TYPE_MBM:
-- 
cgit v1.2.3


From 125a77ed9fbd21d1277f53e9ed6b39ad3d34e613 Mon Sep 17 00:00:00 2001
From: Brian Haley <brian.haley@hp.com>
Date: Wed, 7 Oct 2009 13:57:10 -0700
Subject: IPv6: Fix 6RD build error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix build error introduced in commit fa857afcf - ipv6 sit: 6rd
(IPv6 Rapid Deployment) Support.  Struct in6_addr is the issue.
I'm only seeing this on x86_64 systems, not on 32-bit with same
IPv6 config options, so it could be there's a missing forward
declaration somewhere, but including the correct header file
fixes the problem too.

  CC [M]  net/ipv6/ip6_tunnel.o
In file included from net/ipv6/ip6_tunnel.c:31:
include/linux/if_tunnel.h:59: error: field ‘prefix’ has incomplete type
make[2]: *** [net/ipv6/ip6_tunnel.o] Error 1
make[1]: *** [net/ipv6] Error 2

Signed-off-by: Brian Haley <brian.haley@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_tunnel.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h
index c53c8e016940..8d76cb4c86fa 100644
--- a/include/linux/if_tunnel.h
+++ b/include/linux/if_tunnel.h
@@ -5,6 +5,7 @@
 
 #ifdef __KERNEL__
 #include <linux/ip.h>
+#include <linux/in6.h>
 #endif
 
 #define SIOCGETTUNNEL   (SIOCDEVPRIVATE + 0)
-- 
cgit v1.2.3


From f86dcc5aa8c7908f2c287e7a211228df599e3e71 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 7 Oct 2009 00:37:59 +0000
Subject: udp: dynamically size hash tables at boot time

UDP_HTABLE_SIZE was initialy defined to 128, which is a bit small for
several setups.

4000 active UDP sockets -> 32 sockets per chain in average. An
incoming frame has to lookup all sockets to find best match, so long
chains hurt latency.

Instead of a fixed size hash table that cant be perfect for every
needs, let UDP stack choose its table size at boot time like tcp/ip
route, using alloc_large_system_hash() helper

Add an optional boot parameter, uhash_entries=x so that an admin can
force a size between 256 and 65536 if needed, like thash_entries and
rhash_entries.

dmesg logs two new lines :
[    0.647039] UDP hash table entries: 512 (order: 0, 4096 bytes)
[    0.647099] UDP Lite hash table entries: 512 (order: 0, 4096 bytes)

Maximal size on 64bit arches would be 65536 slots, ie 1 MBytes for non
debugging spinlocks.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/kernel-parameters.txt |  3 ++
 include/linux/udp.h                 |  6 +--
 include/net/udp.h                   | 13 ++++--
 net/ipv4/udp.c                      | 91 +++++++++++++++++++++++++++----------
 net/ipv4/udplite.c                  |  4 +-
 net/ipv6/udp.c                      |  6 +--
 6 files changed, 87 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6fa7292947e5..02df20be7764 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2589,6 +2589,9 @@ and is between 256 and 4096 characters. It is defined in the file
 	uart6850=	[HW,OSS]
 			Format: <io>,<irq>
 
+	uhash_entries=	[KNL,NET]
+			Set number of hash buckets for UDP/UDP-Lite connections
+
 	uhci-hcd.ignore_oc=
 			[USB] Ignore overcurrent events (default N).
 			Some badly-designed motherboards generate lots of
diff --git a/include/linux/udp.h b/include/linux/udp.h
index 0cf5c4c0ec81..832361e3e596 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -45,11 +45,11 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
 	return (struct udphdr *)skb_transport_header(skb);
 }
 
-#define UDP_HTABLE_SIZE		128
+#define UDP_HTABLE_SIZE_MIN		(CONFIG_BASE_SMALL ? 128 : 256)
 
-static inline int udp_hashfn(struct net *net, const unsigned num)
+static inline int udp_hashfn(struct net *net, unsigned num, unsigned mask)
 {
-	return (num + net_hash_mix(net)) & (UDP_HTABLE_SIZE - 1);
+	return (num + net_hash_mix(net)) & mask;
 }
 
 struct udp_sock {
diff --git a/include/net/udp.h b/include/net/udp.h
index f98abd2ce709..22aa2e7eb1d7 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -54,12 +54,19 @@ struct udp_hslot {
 	struct hlist_nulls_head	head;
 	spinlock_t		lock;
 } __attribute__((aligned(2 * sizeof(long))));
+
 struct udp_table {
-	struct udp_hslot	hash[UDP_HTABLE_SIZE];
+	struct udp_hslot	*hash;
+	unsigned int mask;
+	unsigned int log;
 };
 extern struct udp_table udp_table;
-extern void udp_table_init(struct udp_table *);
-
+extern void udp_table_init(struct udp_table *, const char *);
+static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
+					     struct net *net, unsigned num)
+{
+	return &table->hash[udp_hashfn(net, num, table->mask)];
+}
 
 /* Note: this must match 'valbool' in sock_setsockopt */
 #define UDP_CSUM_NOXMIT		1
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6ec6a8a4a224..194bcdc6d9fc 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -106,7 +106,7 @@
 #include <net/xfrm.h>
 #include "udp_impl.h"
 
-struct udp_table udp_table;
+struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
 
 int sysctl_udp_mem[3] __read_mostly;
@@ -121,14 +121,16 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
 atomic_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
-#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE)
+#define MAX_UDP_PORTS 65536
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       const struct udp_hslot *hslot,
 			       unsigned long *bitmap,
 			       struct sock *sk,
 			       int (*saddr_comp)(const struct sock *sk1,
-						 const struct sock *sk2))
+						 const struct sock *sk2),
+			       unsigned int log)
 {
 	struct sock *sk2;
 	struct hlist_nulls_node *node;
@@ -142,8 +144,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    (*saddr_comp)(sk, sk2)) {
 			if (bitmap)
-				__set_bit(sk2->sk_hash / UDP_HTABLE_SIZE,
-					  bitmap);
+				__set_bit(sk2->sk_hash >> log, bitmap);
 			else
 				return 1;
 		}
@@ -180,13 +181,15 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		/*
 		 * force rand to be an odd multiple of UDP_HTABLE_SIZE
 		 */
-		rand = (rand | 1) * UDP_HTABLE_SIZE;
-		for (last = first + UDP_HTABLE_SIZE; first != last; first++) {
-			hslot = &udptable->hash[udp_hashfn(net, first)];
+		rand = (rand | 1) * (udptable->mask + 1);
+		for (last = first + udptable->mask + 1;
+		     first != last;
+		     first++) {
+			hslot = udp_hashslot(udptable, net, first);
 			bitmap_zero(bitmap, PORTS_PER_CHAIN);
 			spin_lock_bh(&hslot->lock);
 			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
-					    saddr_comp);
+					    saddr_comp, udptable->log);
 
 			snum = first;
 			/*
@@ -196,7 +199,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 			 */
 			do {
 				if (low <= snum && snum <= high &&
-				    !test_bit(snum / UDP_HTABLE_SIZE, bitmap))
+				    !test_bit(snum >> udptable->log, bitmap))
 					goto found;
 				snum += rand;
 			} while (snum != first);
@@ -204,9 +207,10 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		}
 		goto fail;
 	} else {
-		hslot = &udptable->hash[udp_hashfn(net, snum)];
+		hslot = udp_hashslot(udptable, net, snum);
 		spin_lock_bh(&hslot->lock);
-		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp))
+		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
+					saddr_comp, 0))
 			goto fail_unlock;
 	}
 found:
@@ -283,7 +287,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	struct sock *sk, *result;
 	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
-	unsigned int hash = udp_hashfn(net, hnum);
+	unsigned int hash = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot = &udptable->hash[hash];
 	int score, badness;
 
@@ -1013,8 +1017,8 @@ void udp_lib_unhash(struct sock *sk)
 {
 	if (sk_hashed(sk)) {
 		struct udp_table *udptable = sk->sk_prot->h.udp_table;
-		unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
-		struct udp_hslot *hslot = &udptable->hash[hash];
+		struct udp_hslot *hslot = udp_hashslot(udptable, sock_net(sk),
+						     sk->sk_hash);
 
 		spin_lock_bh(&hslot->lock);
 		if (sk_nulls_del_node_init_rcu(sk)) {
@@ -1169,7 +1173,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udp_table *udptable)
 {
 	struct sock *sk;
-	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
 	int dif;
 
 	spin_lock(&hslot->lock);
@@ -1609,9 +1613,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 	struct udp_iter_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
 
-	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+	for (state->bucket = start; state->bucket <= state->udp_table->mask;
+	     ++state->bucket) {
 		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+
+		if (hlist_nulls_empty(&hslot->head))
+			continue;
+
 		spin_lock_bh(&hslot->lock);
 		sk_nulls_for_each(sk, node, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
@@ -1636,7 +1645,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
-		if (state->bucket < UDP_HTABLE_SIZE)
+		if (state->bucket <= state->udp_table->mask)
 			spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
 		return udp_get_first(seq, state->bucket + 1);
 	}
@@ -1656,7 +1665,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct udp_iter_state *state = seq->private;
-	state->bucket = UDP_HTABLE_SIZE;
+	state->bucket = MAX_UDP_PORTS;
 
 	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
@@ -1678,7 +1687,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
 {
 	struct udp_iter_state *state = seq->private;
 
-	if (state->bucket < UDP_HTABLE_SIZE)
+	if (state->bucket <= state->udp_table->mask)
 		spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
 }
 
@@ -1738,7 +1747,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 	__u16 destp	  = ntohs(inet->dport);
 	__u16 srcp	  = ntohs(inet->sport);
 
-	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
@@ -1804,11 +1813,43 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-void __init udp_table_init(struct udp_table *table)
+static __initdata unsigned long uhash_entries;
+static int __init set_uhash_entries(char *str)
 {
-	int i;
+	if (!str)
+		return 0;
+	uhash_entries = simple_strtoul(str, &str, 0);
+	if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
+		uhash_entries = UDP_HTABLE_SIZE_MIN;
+	return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
 
-	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+void __init udp_table_init(struct udp_table *table, const char *name)
+{
+	unsigned int i;
+
+	if (!CONFIG_BASE_SMALL)
+		table->hash = alloc_large_system_hash(name,
+			sizeof(struct udp_hslot),
+			uhash_entries,
+			21, /* one slot per 2 MB */
+			0,
+			&table->log,
+			&table->mask,
+			64 * 1024);
+	/*
+	 * Make sure hash table has the minimum size
+	 */
+	if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
+		table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
+				      sizeof(struct udp_hslot), GFP_KERNEL);
+		if (!table->hash)
+			panic(name);
+		table->log = ilog2(UDP_HTABLE_SIZE_MIN);
+		table->mask = UDP_HTABLE_SIZE_MIN - 1;
+	}
+	for (i = 0; i <= table->mask; i++) {
 		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
 		spin_lock_init(&table->hash[i].lock);
 	}
@@ -1818,7 +1859,7 @@ void __init udp_init(void)
 {
 	unsigned long nr_pages, limit;
 
-	udp_table_init(&udp_table);
+	udp_table_init(&udp_table, "UDP");
 	/* Set the pressure threshold up by the same strategy of TCP. It is a
 	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
 	 * toward zero with the amount of memory, with a floor of 128 pages.
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 95248d7f75ec..470c504b9554 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -12,7 +12,7 @@
  */
 #include "udp_impl.h"
 
-struct udp_table 	udplite_table;
+struct udp_table 	udplite_table __read_mostly;
 EXPORT_SYMBOL(udplite_table);
 
 static int udplite_rcv(struct sk_buff *skb)
@@ -110,7 +110,7 @@ static inline int udplite4_proc_init(void)
 
 void __init udplite4_register(void)
 {
-	udp_table_init(&udplite_table);
+	udp_table_init(&udplite_table, "UDP-Lite");
 	if (proto_register(&udplite_prot, 1))
 		goto out_register_err;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c6a303ec834c..ff778c172ef2 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -132,7 +132,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 	struct sock *sk, *result;
 	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
-	unsigned int hash = udp_hashfn(net, hnum);
+	unsigned int hash = udp_hashfn(net, hnum, udptable->mask);
 	struct udp_hslot *hslot = &udptable->hash[hash];
 	int score, badness;
 
@@ -452,7 +452,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 {
 	struct sock *sk, *sk2;
 	const struct udphdr *uh = udp_hdr(skb);
-	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
 	int dif;
 
 	spin_lock(&hslot->lock);
@@ -1197,7 +1197,7 @@ static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket
 	destp = ntohs(inet->dport);
 	srcp  = ntohs(inet->sport);
 	seq_printf(seq,
-		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+		   "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
 		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
 		   bucket,
 		   src->s6_addr32[0], src->s6_addr32[1],
-- 
cgit v1.2.3


From 3758bf25db8caeec667e4e56e030da0ec3060529 Mon Sep 17 00:00:00 2001
From: Anant Gole <anantgole@ti.com>
Date: Wed, 7 Oct 2009 02:59:47 +0000
Subject: can: add TI CAN (HECC) driver

TI HECC (High End CAN Controller) module is found on many TI devices. It
has 32 hardware mailboxes with full implementation of CAN protocol 2.0B
with bus speeds up to 1Mbps. Specifications of the module are available
on TI web <http://www.ti.com>

Signed-off-by: Anant Gole <anantgole@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/can/Kconfig              |    7 +
 drivers/net/can/Makefile             |    1 +
 drivers/net/can/ti_hecc.c            | 1006 ++++++++++++++++++++++++++++++++++
 include/linux/can/platform/ti_hecc.h |   40 ++
 4 files changed, 1054 insertions(+)
 create mode 100644 drivers/net/can/ti_hecc.c
 create mode 100644 include/linux/can/platform/ti_hecc.h

(limited to 'include/linux')

diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig
index df32c109b7ac..26d77cc0ded7 100644
--- a/drivers/net/can/Kconfig
+++ b/drivers/net/can/Kconfig
@@ -95,6 +95,13 @@ config CAN_AT91
 	---help---
 	  This is a driver for the SoC CAN controller in Atmel's AT91SAM9263.
 
+config CAN_TI_HECC
+	depends on CAN_DEV && ARCH_OMAP3
+	tristate "TI High End CAN Controller"
+	---help---
+	  Driver for TI HECC (High End CAN Controller) module found on many
+	  TI devices. The device specifications are available from www.ti.com
+
 config CAN_DEBUG_DEVICES
 	bool "CAN devices debugging messages"
 	depends on CAN
diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile
index 0dea62721f2f..31f4ab5df28b 100644
--- a/drivers/net/can/Makefile
+++ b/drivers/net/can/Makefile
@@ -11,5 +11,6 @@ obj-y				+= usb/
 
 obj-$(CONFIG_CAN_SJA1000)	+= sja1000/
 obj-$(CONFIG_CAN_AT91)		+= at91_can.o
+obj-$(CONFIG_CAN_TI_HECC)	+= ti_hecc.o
 
 ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
new file mode 100644
index 000000000000..814e6c5c6386
--- /dev/null
+++ b/drivers/net/can/ti_hecc.c
@@ -0,0 +1,1006 @@
+/*
+ * TI HECC (CAN) device driver
+ *
+ * This driver supports TI's HECC (High End CAN Controller module) and the
+ * specs for the same is available at <http://www.ti.com>
+ *
+ * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed as is WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/*
+ * Your platform definitions should specify module ram offsets and interrupt
+ * number to use as follows:
+ *
+ * static struct ti_hecc_platform_data am3517_evm_hecc_pdata = {
+ *         .scc_hecc_offset        = 0,
+ *         .scc_ram_offset         = 0x3000,
+ *         .hecc_ram_offset        = 0x3000,
+ *         .mbx_offset             = 0x2000,
+ *         .int_line               = 0,
+ *         .revision               = 1,
+ * };
+ *
+ * Please see include/can/platform/ti_hecc.h for description of above fields
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+
+#include <linux/can.h>
+#include <linux/can/dev.h>
+#include <linux/can/error.h>
+#include <linux/can/platform/ti_hecc.h>
+
+#define DRV_NAME "ti_hecc"
+#define HECC_MODULE_VERSION     "0.7"
+MODULE_VERSION(HECC_MODULE_VERSION);
+#define DRV_DESC "TI High End CAN Controller Driver " HECC_MODULE_VERSION
+
+/* TX / RX Mailbox Configuration */
+#define HECC_MAX_MAILBOXES	32	/* hardware mailboxes - do not change */
+#define MAX_TX_PRIO		0x3F	/* hardware value - do not change */
+
+/*
+ * Important Note: TX mailbox configuration
+ * TX mailboxes should be restricted to the number of SKB buffers to avoid
+ * maintaining SKB buffers separately. TX mailboxes should be a power of 2
+ * for the mailbox logic to work.  Top mailbox numbers are reserved for RX
+ * and lower mailboxes for TX.
+ *
+ * HECC_MAX_TX_MBOX	HECC_MB_TX_SHIFT
+ * 4 (default)		2
+ * 8			3
+ * 16			4
+ */
+#define HECC_MB_TX_SHIFT	2 /* as per table above */
+#define HECC_MAX_TX_MBOX	BIT(HECC_MB_TX_SHIFT)
+
+#if (HECC_MAX_TX_MBOX > CAN_ECHO_SKB_MAX)
+#error "HECC: MAX TX mailboxes should be equal or less than CAN_ECHO_SKB_MAX"
+#endif
+
+#define HECC_TX_PRIO_SHIFT	(HECC_MB_TX_SHIFT)
+#define HECC_TX_PRIO_MASK	(MAX_TX_PRIO << HECC_MB_TX_SHIFT)
+#define HECC_TX_MB_MASK		(HECC_MAX_TX_MBOX - 1)
+#define HECC_TX_MASK		((HECC_MAX_TX_MBOX - 1) | HECC_TX_PRIO_MASK)
+#define HECC_TX_MBOX_MASK	(~(BIT(HECC_MAX_TX_MBOX) - 1))
+#define HECC_DEF_NAPI_WEIGHT	HECC_MAX_RX_MBOX
+
+/*
+ * Important Note: RX mailbox configuration
+ * RX mailboxes are further logically split into two - main and buffer
+ * mailboxes. The goal is to get all packets into main mailboxes as
+ * driven by mailbox number and receive priority (higher to lower) and
+ * buffer mailboxes are used to receive pkts while main mailboxes are being
+ * processed. This ensures in-order packet reception.
+ *
+ * Here are the recommended values for buffer mailbox. Note that RX mailboxes
+ * start after TX mailboxes:
+ *
+ * HECC_MAX_RX_MBOX		HECC_RX_BUFFER_MBOX	No of buffer mailboxes
+ * 28				12			8
+ * 16				20			4
+ */
+
+#define HECC_MAX_RX_MBOX	(HECC_MAX_MAILBOXES - HECC_MAX_TX_MBOX)
+#define HECC_RX_BUFFER_MBOX	12 /* as per table above */
+#define HECC_RX_FIRST_MBOX	(HECC_MAX_MAILBOXES - 1)
+#define HECC_RX_HIGH_MBOX_MASK	(~(BIT(HECC_RX_BUFFER_MBOX) - 1))
+
+/* TI HECC module registers */
+#define HECC_CANME		0x0	/* Mailbox enable */
+#define HECC_CANMD		0x4	/* Mailbox direction */
+#define HECC_CANTRS		0x8	/* Transmit request set */
+#define HECC_CANTRR		0xC	/* Transmit request */
+#define HECC_CANTA		0x10	/* Transmission acknowledge */
+#define HECC_CANAA		0x14	/* Abort acknowledge */
+#define HECC_CANRMP		0x18	/* Receive message pending */
+#define HECC_CANRML		0x1C	/* Remote message lost */
+#define HECC_CANRFP		0x20	/* Remote frame pending */
+#define HECC_CANGAM		0x24	/* SECC only:Global acceptance mask */
+#define HECC_CANMC		0x28	/* Master control */
+#define HECC_CANBTC		0x2C	/* Bit timing configuration */
+#define HECC_CANES		0x30	/* Error and status */
+#define HECC_CANTEC		0x34	/* Transmit error counter */
+#define HECC_CANREC		0x38	/* Receive error counter */
+#define HECC_CANGIF0		0x3C	/* Global interrupt flag 0 */
+#define HECC_CANGIM		0x40	/* Global interrupt mask */
+#define HECC_CANGIF1		0x44	/* Global interrupt flag 1 */
+#define HECC_CANMIM		0x48	/* Mailbox interrupt mask */
+#define HECC_CANMIL		0x4C	/* Mailbox interrupt level */
+#define HECC_CANOPC		0x50	/* Overwrite protection control */
+#define HECC_CANTIOC		0x54	/* Transmit I/O control */
+#define HECC_CANRIOC		0x58	/* Receive I/O control */
+#define HECC_CANLNT		0x5C	/* HECC only: Local network time */
+#define HECC_CANTOC		0x60	/* HECC only: Time-out control */
+#define HECC_CANTOS		0x64	/* HECC only: Time-out status */
+#define HECC_CANTIOCE		0x68	/* SCC only:Enhanced TX I/O control */
+#define HECC_CANRIOCE		0x6C	/* SCC only:Enhanced RX I/O control */
+
+/* Mailbox registers */
+#define HECC_CANMID		0x0
+#define HECC_CANMCF		0x4
+#define HECC_CANMDL		0x8
+#define HECC_CANMDH		0xC
+
+#define HECC_SET_REG		0xFFFFFFFF
+#define HECC_CANID_MASK		0x3FF	/* 18 bits mask for extended id's */
+#define HECC_CCE_WAIT_COUNT     100	/* Wait for ~1 sec for CCE bit */
+
+#define HECC_CANMC_SCM		BIT(13)	/* SCC compat mode */
+#define HECC_CANMC_CCR		BIT(12)	/* Change config request */
+#define HECC_CANMC_PDR		BIT(11)	/* Local Power down - for sleep mode */
+#define HECC_CANMC_ABO		BIT(7)	/* Auto Bus On */
+#define HECC_CANMC_STM		BIT(6)	/* Self test mode - loopback */
+#define HECC_CANMC_SRES		BIT(5)	/* Software reset */
+
+#define HECC_CANTIOC_EN		BIT(3)	/* Enable CAN TX I/O pin */
+#define HECC_CANRIOC_EN		BIT(3)	/* Enable CAN RX I/O pin */
+
+#define HECC_CANMID_IDE		BIT(31)	/* Extended frame format */
+#define HECC_CANMID_AME		BIT(30)	/* Acceptance mask enable */
+#define HECC_CANMID_AAM		BIT(29)	/* Auto answer mode */
+
+#define HECC_CANES_FE		BIT(24)	/* form error */
+#define HECC_CANES_BE		BIT(23)	/* bit error */
+#define HECC_CANES_SA1		BIT(22)	/* stuck at dominant error */
+#define HECC_CANES_CRCE		BIT(21)	/* CRC error */
+#define HECC_CANES_SE		BIT(20)	/* stuff bit error */
+#define HECC_CANES_ACKE		BIT(19)	/* ack error */
+#define HECC_CANES_BO		BIT(18)	/* Bus off status */
+#define HECC_CANES_EP		BIT(17)	/* Error passive status */
+#define HECC_CANES_EW		BIT(16)	/* Error warning status */
+#define HECC_CANES_SMA		BIT(5)	/* suspend mode ack */
+#define HECC_CANES_CCE		BIT(4)	/* Change config enabled */
+#define HECC_CANES_PDA		BIT(3)	/* Power down mode ack */
+
+#define HECC_CANBTC_SAM		BIT(7)	/* sample points */
+
+#define HECC_BUS_ERROR		(HECC_CANES_FE | HECC_CANES_BE |\
+				HECC_CANES_CRCE | HECC_CANES_SE |\
+				HECC_CANES_ACKE)
+
+#define HECC_CANMCF_RTR		BIT(4)	/* Remote transmit request */
+
+#define HECC_CANGIF_MAIF	BIT(17)	/* Message alarm interrupt */
+#define HECC_CANGIF_TCOIF	BIT(16) /* Timer counter overflow int */
+#define HECC_CANGIF_GMIF	BIT(15)	/* Global mailbox interrupt */
+#define HECC_CANGIF_AAIF	BIT(14)	/* Abort ack interrupt */
+#define HECC_CANGIF_WDIF	BIT(13)	/* Write denied interrupt */
+#define HECC_CANGIF_WUIF	BIT(12)	/* Wake up interrupt */
+#define HECC_CANGIF_RMLIF	BIT(11)	/* Receive message lost interrupt */
+#define HECC_CANGIF_BOIF	BIT(10)	/* Bus off interrupt */
+#define HECC_CANGIF_EPIF	BIT(9)	/* Error passive interrupt */
+#define HECC_CANGIF_WLIF	BIT(8)	/* Warning level interrupt */
+#define HECC_CANGIF_MBOX_MASK	0x1F	/* Mailbox number mask */
+#define HECC_CANGIM_I1EN	BIT(1)	/* Int line 1 enable */
+#define HECC_CANGIM_I0EN	BIT(0)	/* Int line 0 enable */
+#define HECC_CANGIM_DEF_MASK	0x700	/* only busoff/warning/passive */
+#define HECC_CANGIM_SIL		BIT(2)	/* system interrupts to int line 1 */
+
+/* CAN Bittiming constants as per HECC specs */
+static struct can_bittiming_const ti_hecc_bittiming_const = {
+	.name = DRV_NAME,
+	.tseg1_min = 1,
+	.tseg1_max = 16,
+	.tseg2_min = 1,
+	.tseg2_max = 8,
+	.sjw_max = 4,
+	.brp_min = 1,
+	.brp_max = 256,
+	.brp_inc = 1,
+};
+
+struct ti_hecc_priv {
+	struct can_priv can;	/* MUST be first member/field */
+	struct napi_struct napi;
+	struct net_device *ndev;
+	struct clk *clk;
+	void __iomem *base;
+	u32 scc_ram_offset;
+	u32 hecc_ram_offset;
+	u32 mbx_offset;
+	u32 int_line;
+	spinlock_t mbx_lock; /* CANME register needs protection */
+	u32 tx_head;
+	u32 tx_tail;
+	u32 rx_next;
+};
+
+static inline int get_tx_head_mb(struct ti_hecc_priv *priv)
+{
+	return priv->tx_head & HECC_TX_MB_MASK;
+}
+
+static inline int get_tx_tail_mb(struct ti_hecc_priv *priv)
+{
+	return priv->tx_tail & HECC_TX_MB_MASK;
+}
+
+static inline int get_tx_head_prio(struct ti_hecc_priv *priv)
+{
+	return (priv->tx_head >> HECC_TX_PRIO_SHIFT) & MAX_TX_PRIO;
+}
+
+static inline void hecc_write_lam(struct ti_hecc_priv *priv, u32 mbxno, u32 val)
+{
+	__raw_writel(val, priv->base + priv->hecc_ram_offset + mbxno * 4);
+}
+
+static inline void hecc_write_mbx(struct ti_hecc_priv *priv, u32 mbxno,
+	u32 reg, u32 val)
+{
+	__raw_writel(val, priv->base + priv->mbx_offset + mbxno * 0x10 +
+			reg);
+}
+
+static inline u32 hecc_read_mbx(struct ti_hecc_priv *priv, u32 mbxno, u32 reg)
+{
+	return __raw_readl(priv->base + priv->mbx_offset + mbxno * 0x10 +
+			reg);
+}
+
+static inline void hecc_write(struct ti_hecc_priv *priv, u32 reg, u32 val)
+{
+	__raw_writel(val, priv->base + reg);
+}
+
+static inline u32 hecc_read(struct ti_hecc_priv *priv, int reg)
+{
+	return __raw_readl(priv->base + reg);
+}
+
+static inline void hecc_set_bit(struct ti_hecc_priv *priv, int reg,
+	u32 bit_mask)
+{
+	hecc_write(priv, reg, hecc_read(priv, reg) | bit_mask);
+}
+
+static inline void hecc_clear_bit(struct ti_hecc_priv *priv, int reg,
+	u32 bit_mask)
+{
+	hecc_write(priv, reg, hecc_read(priv, reg) & ~bit_mask);
+}
+
+static inline u32 hecc_get_bit(struct ti_hecc_priv *priv, int reg, u32 bit_mask)
+{
+	return (hecc_read(priv, reg) & bit_mask) ? 1 : 0;
+}
+
+static int ti_hecc_get_state(const struct net_device *ndev,
+	enum can_state *state)
+{
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+
+	*state = priv->can.state;
+	return 0;
+}
+
+static int ti_hecc_set_btc(struct ti_hecc_priv *priv)
+{
+	struct can_bittiming *bit_timing = &priv->can.bittiming;
+	u32 can_btc;
+
+	can_btc = (bit_timing->phase_seg2 - 1) & 0x7;
+	can_btc |= ((bit_timing->phase_seg1 + bit_timing->prop_seg - 1)
+			& 0xF) << 3;
+	if (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) {
+		if (bit_timing->brp > 4)
+			can_btc |= HECC_CANBTC_SAM;
+		else
+			dev_warn(priv->ndev->dev.parent, "WARN: Triple" \
+				"sampling not set due to h/w limitations");
+	}
+	can_btc |= ((bit_timing->sjw - 1) & 0x3) << 8;
+	can_btc |= ((bit_timing->brp - 1) & 0xFF) << 16;
+
+	/* ERM being set to 0 by default meaning resync at falling edge */
+
+	hecc_write(priv, HECC_CANBTC, can_btc);
+	dev_info(priv->ndev->dev.parent, "setting CANBTC=%#x\n", can_btc);
+
+	return 0;
+}
+
+static void ti_hecc_reset(struct net_device *ndev)
+{
+	u32 cnt;
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+
+	dev_dbg(ndev->dev.parent, "resetting hecc ...\n");
+	hecc_set_bit(priv, HECC_CANMC, HECC_CANMC_SRES);
+
+	/* Set change control request and wait till enabled */
+	hecc_set_bit(priv, HECC_CANMC, HECC_CANMC_CCR);
+
+	/*
+	 * INFO: It has been observed that at times CCE bit may not be
+	 * set and hw seems to be ok even if this bit is not set so
+	 * timing out with a timing of 1ms to respect the specs
+	 */
+	cnt = HECC_CCE_WAIT_COUNT;
+	while (!hecc_get_bit(priv, HECC_CANES, HECC_CANES_CCE) && cnt != 0) {
+		--cnt;
+		udelay(10);
+	}
+
+	/*
+	 * Note: On HECC, BTC can be programmed only in initialization mode, so
+	 * it is expected that the can bittiming parameters are set via ip
+	 * utility before the device is opened
+	 */
+	ti_hecc_set_btc(priv);
+
+	/* Clear CCR (and CANMC register) and wait for CCE = 0 enable */
+	hecc_write(priv, HECC_CANMC, 0);
+
+	/*
+	 * INFO: CAN net stack handles bus off and hence disabling auto-bus-on
+	 * hecc_set_bit(priv, HECC_CANMC, HECC_CANMC_ABO);
+	 */
+
+	/*
+	 * INFO: It has been observed that at times CCE bit may not be
+	 * set and hw seems to be ok even if this bit is not set so
+	 */
+	cnt = HECC_CCE_WAIT_COUNT;
+	while (hecc_get_bit(priv, HECC_CANES, HECC_CANES_CCE) && cnt != 0) {
+		--cnt;
+		udelay(10);
+	}
+
+	/* Enable TX and RX I/O Control pins */
+	hecc_write(priv, HECC_CANTIOC, HECC_CANTIOC_EN);
+	hecc_write(priv, HECC_CANRIOC, HECC_CANRIOC_EN);
+
+	/* Clear registers for clean operation */
+	hecc_write(priv, HECC_CANTA, HECC_SET_REG);
+	hecc_write(priv, HECC_CANRMP, HECC_SET_REG);
+	hecc_write(priv, HECC_CANGIF0, HECC_SET_REG);
+	hecc_write(priv, HECC_CANGIF1, HECC_SET_REG);
+	hecc_write(priv, HECC_CANME, 0);
+	hecc_write(priv, HECC_CANMD, 0);
+
+	/* SCC compat mode NOT supported (and not needed too) */
+	hecc_set_bit(priv, HECC_CANMC, HECC_CANMC_SCM);
+}
+
+static void ti_hecc_start(struct net_device *ndev)
+{
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+	u32 cnt, mbxno, mbx_mask;
+
+	/* put HECC in initialization mode and set btc */
+	ti_hecc_reset(ndev);
+
+	priv->tx_head = priv->tx_tail = HECC_TX_MASK;
+	priv->rx_next = HECC_RX_FIRST_MBOX;
+
+	/* Enable local and global acceptance mask registers */
+	hecc_write(priv, HECC_CANGAM, HECC_SET_REG);
+
+	/* Prepare configured mailboxes to receive messages */
+	for (cnt = 0; cnt < HECC_MAX_RX_MBOX; cnt++) {
+		mbxno = HECC_MAX_MAILBOXES - 1 - cnt;
+		mbx_mask = BIT(mbxno);
+		hecc_clear_bit(priv, HECC_CANME, mbx_mask);
+		hecc_write_mbx(priv, mbxno, HECC_CANMID, HECC_CANMID_AME);
+		hecc_write_lam(priv, mbxno, HECC_SET_REG);
+		hecc_set_bit(priv, HECC_CANMD, mbx_mask);
+		hecc_set_bit(priv, HECC_CANME, mbx_mask);
+		hecc_set_bit(priv, HECC_CANMIM, mbx_mask);
+	}
+
+	/* Prevent message over-write & Enable interrupts */
+	hecc_write(priv, HECC_CANOPC, HECC_SET_REG);
+	if (priv->int_line) {
+		hecc_write(priv, HECC_CANMIL, HECC_SET_REG);
+		hecc_write(priv, HECC_CANGIM, HECC_CANGIM_DEF_MASK |
+			HECC_CANGIM_I1EN | HECC_CANGIM_SIL);
+	} else {
+		hecc_write(priv, HECC_CANMIL, 0);
+		hecc_write(priv, HECC_CANGIM,
+			HECC_CANGIM_DEF_MASK | HECC_CANGIM_I0EN);
+	}
+	priv->can.state = CAN_STATE_ERROR_ACTIVE;
+}
+
+static void ti_hecc_stop(struct net_device *ndev)
+{
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+
+	/* Disable interrupts and disable mailboxes */
+	hecc_write(priv, HECC_CANGIM, 0);
+	hecc_write(priv, HECC_CANMIM, 0);
+	hecc_write(priv, HECC_CANME, 0);
+	priv->can.state = CAN_STATE_STOPPED;
+}
+
+static int ti_hecc_do_set_mode(struct net_device *ndev, enum can_mode mode)
+{
+	int ret = 0;
+
+	switch (mode) {
+	case CAN_MODE_START:
+		ti_hecc_start(ndev);
+		netif_wake_queue(ndev);
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * ti_hecc_xmit: HECC Transmit
+ *
+ * The transmit mailboxes start from 0 to HECC_MAX_TX_MBOX. In HECC the
+ * priority of the mailbox for tranmission is dependent upon priority setting
+ * field in mailbox registers. The mailbox with highest value in priority field
+ * is transmitted first. Only when two mailboxes have the same value in
+ * priority field the highest numbered mailbox is transmitted first.
+ *
+ * To utilize the HECC priority feature as described above we start with the
+ * highest numbered mailbox with highest priority level and move on to the next
+ * mailbox with the same priority level and so on. Once we loop through all the
+ * transmit mailboxes we choose the next priority level (lower) and so on
+ * until we reach the lowest priority level on the lowest numbered mailbox
+ * when we stop transmission until all mailboxes are transmitted and then
+ * restart at highest numbered mailbox with highest priority.
+ *
+ * Two counters (head and tail) are used to track the next mailbox to transmit
+ * and to track the echo buffer for already transmitted mailbox. The queue
+ * is stopped when all the mailboxes are busy or when there is a priority
+ * value roll-over happens.
+ */
+static netdev_tx_t ti_hecc_xmit(struct sk_buff *skb, struct net_device *ndev)
+{
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+	struct can_frame *cf = (struct can_frame *)skb->data;
+	u32 mbxno, mbx_mask, data;
+	unsigned long flags;
+
+	mbxno = get_tx_head_mb(priv);
+	mbx_mask = BIT(mbxno);
+	spin_lock_irqsave(&priv->mbx_lock, flags);
+	if (unlikely(hecc_read(priv, HECC_CANME) & mbx_mask)) {
+		spin_unlock_irqrestore(&priv->mbx_lock, flags);
+		netif_stop_queue(ndev);
+		dev_err(priv->ndev->dev.parent,
+			"BUG: TX mbx not ready tx_head=%08X, tx_tail=%08X\n",
+			priv->tx_head, priv->tx_tail);
+		return NETDEV_TX_BUSY;
+	}
+	spin_unlock_irqrestore(&priv->mbx_lock, flags);
+
+	/* Prepare mailbox for transmission */
+	data = min_t(u8, cf->can_dlc, 8);
+	if (cf->can_id & CAN_RTR_FLAG) /* Remote transmission request */
+		data |= HECC_CANMCF_RTR;
+	data |= get_tx_head_prio(priv) << 8;
+	hecc_write_mbx(priv, mbxno, HECC_CANMCF, data);
+
+	if (cf->can_id & CAN_EFF_FLAG) /* Extended frame format */
+		data = (cf->can_id & CAN_EFF_MASK) | HECC_CANMID_IDE;
+	else /* Standard frame format */
+		data = (cf->can_id & CAN_SFF_MASK) << 18;
+	hecc_write_mbx(priv, mbxno, HECC_CANMID, data);
+	hecc_write_mbx(priv, mbxno, HECC_CANMDL,
+		be32_to_cpu(*(u32 *)(cf->data)));
+	if (cf->can_dlc > 4)
+		hecc_write_mbx(priv, mbxno, HECC_CANMDH,
+			be32_to_cpu(*(u32 *)(cf->data + 4)));
+	else
+		*(u32 *)(cf->data + 4) = 0;
+	can_put_echo_skb(skb, ndev, mbxno);
+
+	spin_lock_irqsave(&priv->mbx_lock, flags);
+	--priv->tx_head;
+	if ((hecc_read(priv, HECC_CANME) & BIT(get_tx_head_mb(priv))) ||
+		(priv->tx_head & HECC_TX_MASK) == HECC_TX_MASK) {
+		netif_stop_queue(ndev);
+	}
+	hecc_set_bit(priv, HECC_CANME, mbx_mask);
+	spin_unlock_irqrestore(&priv->mbx_lock, flags);
+
+	hecc_clear_bit(priv, HECC_CANMD, mbx_mask);
+	hecc_set_bit(priv, HECC_CANMIM, mbx_mask);
+	hecc_write(priv, HECC_CANTRS, mbx_mask);
+
+	return NETDEV_TX_OK;
+}
+
+static int ti_hecc_rx_pkt(struct ti_hecc_priv *priv, int mbxno)
+{
+	struct net_device_stats *stats = &priv->ndev->stats;
+	struct can_frame *cf;
+	struct sk_buff *skb;
+	u32 data, mbx_mask;
+	unsigned long flags;
+
+	skb = netdev_alloc_skb(priv->ndev, sizeof(struct can_frame));
+	if (!skb) {
+		if (printk_ratelimit())
+			dev_err(priv->ndev->dev.parent,
+				"ti_hecc_rx_pkt: netdev_alloc_skb() failed\n");
+		return -ENOMEM;
+	}
+	skb->protocol = __constant_htons(ETH_P_CAN);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	mbx_mask = BIT(mbxno);
+	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
+	data = hecc_read_mbx(priv, mbxno, HECC_CANMID);
+	if (data & HECC_CANMID_IDE)
+		cf->can_id = (data & CAN_EFF_MASK) | CAN_EFF_FLAG;
+	else
+		cf->can_id = (data >> 18) & CAN_SFF_MASK;
+	data = hecc_read_mbx(priv, mbxno, HECC_CANMCF);
+	if (data & HECC_CANMCF_RTR)
+		cf->can_id |= CAN_RTR_FLAG;
+	cf->can_dlc = data & 0xF;
+	data = hecc_read_mbx(priv, mbxno, HECC_CANMDL);
+	*(u32 *)(cf->data) = cpu_to_be32(data);
+	if (cf->can_dlc > 4) {
+		data = hecc_read_mbx(priv, mbxno, HECC_CANMDH);
+		*(u32 *)(cf->data + 4) = cpu_to_be32(data);
+	} else {
+		*(u32 *)(cf->data + 4) = 0;
+	}
+	spin_lock_irqsave(&priv->mbx_lock, flags);
+	hecc_clear_bit(priv, HECC_CANME, mbx_mask);
+	hecc_write(priv, HECC_CANRMP, mbx_mask);
+	/* enable mailbox only if it is part of rx buffer mailboxes */
+	if (priv->rx_next < HECC_RX_BUFFER_MBOX)
+		hecc_set_bit(priv, HECC_CANME, mbx_mask);
+	spin_unlock_irqrestore(&priv->mbx_lock, flags);
+
+	stats->rx_bytes += cf->can_dlc;
+	netif_receive_skb(skb);
+	stats->rx_packets++;
+
+	return 0;
+}
+
+/*
+ * ti_hecc_rx_poll - HECC receive pkts
+ *
+ * The receive mailboxes start from highest numbered mailbox till last xmit
+ * mailbox. On CAN frame reception the hardware places the data into highest
+ * numbered mailbox that matches the CAN ID filter. Since all receive mailboxes
+ * have same filtering (ALL CAN frames) packets will arrive in the highest
+ * available RX mailbox and we need to ensure in-order packet reception.
+ *
+ * To ensure the packets are received in the right order we logically divide
+ * the RX mailboxes into main and buffer mailboxes. Packets are received as per
+ * mailbox priotity (higher to lower) in the main bank and once it is full we
+ * disable further reception into main mailboxes. While the main mailboxes are
+ * processed in NAPI, further packets are received in buffer mailboxes.
+ *
+ * We maintain a RX next mailbox counter to process packets and once all main
+ * mailboxe packets are passed to the upper stack we enable all of them but
+ * continue to process packets received in buffer mailboxes. With each packet
+ * received from buffer mailbox we enable it immediately so as to handle the
+ * overflow from higher mailboxes.
+ */
+static int ti_hecc_rx_poll(struct napi_struct *napi, int quota)
+{
+	struct net_device *ndev = napi->dev;
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+	u32 num_pkts = 0;
+	u32 mbx_mask;
+	unsigned long pending_pkts, flags;
+
+	if (!netif_running(ndev))
+		return 0;
+
+	while ((pending_pkts = hecc_read(priv, HECC_CANRMP)) &&
+		num_pkts < quota) {
+		mbx_mask = BIT(priv->rx_next); /* next rx mailbox to process */
+		if (mbx_mask & pending_pkts) {
+			if (ti_hecc_rx_pkt(priv, priv->rx_next) < 0)
+				return num_pkts;
+			++num_pkts;
+		} else if (priv->rx_next > HECC_RX_BUFFER_MBOX) {
+			break; /* pkt not received yet */
+		}
+		--priv->rx_next;
+		if (priv->rx_next == HECC_RX_BUFFER_MBOX) {
+			/* enable high bank mailboxes */
+			spin_lock_irqsave(&priv->mbx_lock, flags);
+			mbx_mask = hecc_read(priv, HECC_CANME);
+			mbx_mask |= HECC_RX_HIGH_MBOX_MASK;
+			hecc_write(priv, HECC_CANME, mbx_mask);
+			spin_unlock_irqrestore(&priv->mbx_lock, flags);
+		} else if (priv->rx_next == HECC_MAX_TX_MBOX - 1) {
+			priv->rx_next = HECC_RX_FIRST_MBOX;
+			break;
+		}
+	}
+
+	/* Enable packet interrupt if all pkts are handled */
+	if (hecc_read(priv, HECC_CANRMP) == 0) {
+		napi_complete(napi);
+		/* Re-enable RX mailbox interrupts */
+		mbx_mask = hecc_read(priv, HECC_CANMIM);
+		mbx_mask |= HECC_TX_MBOX_MASK;
+		hecc_write(priv, HECC_CANMIM, mbx_mask);
+	}
+
+	return num_pkts;
+}
+
+static int ti_hecc_error(struct net_device *ndev, int int_status,
+	int err_status)
+{
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+	struct net_device_stats *stats = &ndev->stats;
+	struct can_frame *cf;
+	struct sk_buff *skb;
+
+	/* propogate the error condition to the can stack */
+	skb = netdev_alloc_skb(ndev, sizeof(struct can_frame));
+	if (!skb) {
+		if (printk_ratelimit())
+			dev_err(priv->ndev->dev.parent,
+				"ti_hecc_error: netdev_alloc_skb() failed\n");
+		return -ENOMEM;
+	}
+	skb->protocol = __constant_htons(ETH_P_CAN);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
+	memset(cf, 0, sizeof(struct can_frame));
+	cf->can_id = CAN_ERR_FLAG;
+	cf->can_dlc = CAN_ERR_DLC;
+
+	if (int_status & HECC_CANGIF_WLIF) { /* warning level int */
+		if ((int_status & HECC_CANGIF_BOIF) == 0) {
+			priv->can.state = CAN_STATE_ERROR_WARNING;
+			++priv->can.can_stats.error_warning;
+			cf->can_id |= CAN_ERR_CRTL;
+			if (hecc_read(priv, HECC_CANTEC) > 96)
+				cf->data[1] |= CAN_ERR_CRTL_TX_WARNING;
+			if (hecc_read(priv, HECC_CANREC) > 96)
+				cf->data[1] |= CAN_ERR_CRTL_RX_WARNING;
+		}
+		hecc_set_bit(priv, HECC_CANES, HECC_CANES_EW);
+		dev_dbg(priv->ndev->dev.parent, "Error Warning interrupt\n");
+		hecc_clear_bit(priv, HECC_CANMC, HECC_CANMC_CCR);
+	}
+
+	if (int_status & HECC_CANGIF_EPIF) { /* error passive int */
+		if ((int_status & HECC_CANGIF_BOIF) == 0) {
+			priv->can.state = CAN_STATE_ERROR_PASSIVE;
+			++priv->can.can_stats.error_passive;
+			cf->can_id |= CAN_ERR_CRTL;
+			if (hecc_read(priv, HECC_CANTEC) > 127)
+				cf->data[1] |= CAN_ERR_CRTL_TX_PASSIVE;
+			if (hecc_read(priv, HECC_CANREC) > 127)
+				cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE;
+		}
+		hecc_set_bit(priv, HECC_CANES, HECC_CANES_EP);
+		dev_dbg(priv->ndev->dev.parent, "Error passive interrupt\n");
+		hecc_clear_bit(priv, HECC_CANMC, HECC_CANMC_CCR);
+	}
+
+	/*
+	 * Need to check busoff condition in error status register too to
+	 * ensure warning interrupts don't hog the system
+	 */
+	if ((int_status & HECC_CANGIF_BOIF) || (err_status & HECC_CANES_BO)) {
+		priv->can.state = CAN_STATE_BUS_OFF;
+		cf->can_id |= CAN_ERR_BUSOFF;
+		hecc_set_bit(priv, HECC_CANES, HECC_CANES_BO);
+		hecc_clear_bit(priv, HECC_CANMC, HECC_CANMC_CCR);
+		/* Disable all interrupts in bus-off to avoid int hog */
+		hecc_write(priv, HECC_CANGIM, 0);
+		can_bus_off(ndev);
+	}
+
+	if (err_status & HECC_BUS_ERROR) {
+		++priv->can.can_stats.bus_error;
+		cf->can_id |= CAN_ERR_BUSERROR | CAN_ERR_PROT;
+		cf->data[2] |= CAN_ERR_PROT_UNSPEC;
+		if (err_status & HECC_CANES_FE) {
+			hecc_set_bit(priv, HECC_CANES, HECC_CANES_FE);
+			cf->data[2] |= CAN_ERR_PROT_FORM;
+		}
+		if (err_status & HECC_CANES_BE) {
+			hecc_set_bit(priv, HECC_CANES, HECC_CANES_BE);
+			cf->data[2] |= CAN_ERR_PROT_BIT;
+		}
+		if (err_status & HECC_CANES_SE) {
+			hecc_set_bit(priv, HECC_CANES, HECC_CANES_SE);
+			cf->data[2] |= CAN_ERR_PROT_STUFF;
+		}
+		if (err_status & HECC_CANES_CRCE) {
+			hecc_set_bit(priv, HECC_CANES, HECC_CANES_CRCE);
+			cf->data[2] |= CAN_ERR_PROT_LOC_CRC_SEQ |
+					CAN_ERR_PROT_LOC_CRC_DEL;
+		}
+		if (err_status & HECC_CANES_ACKE) {
+			hecc_set_bit(priv, HECC_CANES, HECC_CANES_ACKE);
+			cf->data[2] |= CAN_ERR_PROT_LOC_ACK |
+					CAN_ERR_PROT_LOC_ACK_DEL;
+		}
+	}
+
+	netif_receive_skb(skb);
+	stats->rx_packets++;
+	stats->rx_bytes += cf->can_dlc;
+	return 0;
+}
+
+static irqreturn_t ti_hecc_interrupt(int irq, void *dev_id)
+{
+	struct net_device *ndev = (struct net_device *)dev_id;
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+	struct net_device_stats *stats = &ndev->stats;
+	u32 mbxno, mbx_mask, int_status, err_status;
+	unsigned long ack, flags;
+
+	int_status = hecc_read(priv,
+		(priv->int_line) ? HECC_CANGIF1 : HECC_CANGIF0);
+
+	if (!int_status)
+		return IRQ_NONE;
+
+	err_status = hecc_read(priv, HECC_CANES);
+	if (err_status & (HECC_BUS_ERROR | HECC_CANES_BO |
+		HECC_CANES_EP | HECC_CANES_EW))
+			ti_hecc_error(ndev, int_status, err_status);
+
+	if (int_status & HECC_CANGIF_GMIF) {
+		while (priv->tx_tail - priv->tx_head > 0) {
+			mbxno = get_tx_tail_mb(priv);
+			mbx_mask = BIT(mbxno);
+			if (!(mbx_mask & hecc_read(priv, HECC_CANTA)))
+				break;
+			hecc_clear_bit(priv, HECC_CANMIM, mbx_mask);
+			hecc_write(priv, HECC_CANTA, mbx_mask);
+			spin_lock_irqsave(&priv->mbx_lock, flags);
+			hecc_clear_bit(priv, HECC_CANME, mbx_mask);
+			spin_unlock_irqrestore(&priv->mbx_lock, flags);
+			stats->tx_bytes += hecc_read_mbx(priv, mbxno,
+						HECC_CANMCF) & 0xF;
+			stats->tx_packets++;
+			can_get_echo_skb(ndev, mbxno);
+			--priv->tx_tail;
+		}
+
+		/* restart queue if wrap-up or if queue stalled on last pkt */
+		if (((priv->tx_head == priv->tx_tail) &&
+		((priv->tx_head & HECC_TX_MASK) != HECC_TX_MASK)) ||
+		(((priv->tx_tail & HECC_TX_MASK) == HECC_TX_MASK) &&
+		((priv->tx_head & HECC_TX_MASK) == HECC_TX_MASK)))
+			netif_wake_queue(ndev);
+
+		/* Disable RX mailbox interrupts and let NAPI reenable them */
+		if (hecc_read(priv, HECC_CANRMP)) {
+			ack = hecc_read(priv, HECC_CANMIM);
+			ack &= BIT(HECC_MAX_TX_MBOX) - 1;
+			hecc_write(priv, HECC_CANMIM, ack);
+			napi_schedule(&priv->napi);
+		}
+	}
+
+	/* clear all interrupt conditions - read back to avoid spurious ints */
+	if (priv->int_line) {
+		hecc_write(priv, HECC_CANGIF1, HECC_SET_REG);
+		int_status = hecc_read(priv, HECC_CANGIF1);
+	} else {
+		hecc_write(priv, HECC_CANGIF0, HECC_SET_REG);
+		int_status = hecc_read(priv, HECC_CANGIF0);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int ti_hecc_open(struct net_device *ndev)
+{
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+	int err;
+
+	err = request_irq(ndev->irq, ti_hecc_interrupt, IRQF_SHARED,
+			ndev->name, ndev);
+	if (err) {
+		dev_err(ndev->dev.parent, "error requesting interrupt\n");
+		return err;
+	}
+
+	/* Open common can device */
+	err = open_candev(ndev);
+	if (err) {
+		dev_err(ndev->dev.parent, "open_candev() failed %d\n", err);
+		free_irq(ndev->irq, ndev);
+		return err;
+	}
+
+	clk_enable(priv->clk);
+	ti_hecc_start(ndev);
+	napi_enable(&priv->napi);
+	netif_start_queue(ndev);
+
+	return 0;
+}
+
+static int ti_hecc_close(struct net_device *ndev)
+{
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+
+	netif_stop_queue(ndev);
+	napi_disable(&priv->napi);
+	ti_hecc_stop(ndev);
+	free_irq(ndev->irq, ndev);
+	clk_disable(priv->clk);
+	close_candev(ndev);
+
+	return 0;
+}
+
+static const struct net_device_ops ti_hecc_netdev_ops = {
+	.ndo_open		= ti_hecc_open,
+	.ndo_stop		= ti_hecc_close,
+	.ndo_start_xmit		= ti_hecc_xmit,
+};
+
+static int ti_hecc_probe(struct platform_device *pdev)
+{
+	struct net_device *ndev = (struct net_device *)0;
+	struct ti_hecc_priv *priv;
+	struct ti_hecc_platform_data *pdata;
+	struct resource *mem, *irq;
+	void __iomem *addr;
+	int err = -ENODEV;
+
+	pdata = pdev->dev.platform_data;
+	if (!pdata) {
+		dev_err(&pdev->dev, "No platform data\n");
+		goto probe_exit;
+	}
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!mem) {
+		dev_err(&pdev->dev, "No mem resources\n");
+		goto probe_exit;
+	}
+	irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!irq) {
+		dev_err(&pdev->dev, "No irq resource\n");
+		goto probe_exit;
+	}
+	if (!request_mem_region(mem->start, resource_size(mem), pdev->name)) {
+		dev_err(&pdev->dev, "HECC region already claimed\n");
+		err = -EBUSY;
+		goto probe_exit;
+	}
+	addr = ioremap(mem->start, resource_size(mem));
+	if (!addr) {
+		dev_err(&pdev->dev, "ioremap failed\n");
+		err = -ENOMEM;
+		goto probe_exit_free_region;
+	}
+
+	ndev = alloc_candev(sizeof(struct ti_hecc_priv));
+	if (!ndev) {
+		dev_err(&pdev->dev, "alloc_candev failed\n");
+		err = -ENOMEM;
+		goto probe_exit_iounmap;
+	}
+
+	priv = netdev_priv(ndev);
+	priv->ndev = ndev;
+	priv->base = addr;
+	priv->scc_ram_offset = pdata->scc_ram_offset;
+	priv->hecc_ram_offset = pdata->hecc_ram_offset;
+	priv->mbx_offset = pdata->mbx_offset;
+	priv->int_line = pdata->int_line;
+
+	priv->can.bittiming_const = &ti_hecc_bittiming_const;
+	priv->can.do_set_mode = ti_hecc_do_set_mode;
+	priv->can.do_get_state = ti_hecc_get_state;
+
+	ndev->irq = irq->start;
+	ndev->flags |= IFF_ECHO;
+	platform_set_drvdata(pdev, ndev);
+	SET_NETDEV_DEV(ndev, &pdev->dev);
+	ndev->netdev_ops = &ti_hecc_netdev_ops;
+
+	priv->clk = clk_get(&pdev->dev, "hecc_ck");
+	if (IS_ERR(priv->clk)) {
+		dev_err(&pdev->dev, "No clock available\n");
+		err = PTR_ERR(priv->clk);
+		priv->clk = NULL;
+		goto probe_exit_candev;
+	}
+	priv->can.clock.freq = clk_get_rate(priv->clk);
+	netif_napi_add(ndev, &priv->napi, ti_hecc_rx_poll,
+		HECC_DEF_NAPI_WEIGHT);
+
+	err = register_candev(ndev);
+	if (err) {
+		dev_err(&pdev->dev, "register_candev() failed\n");
+		goto probe_exit_clk;
+	}
+	dev_info(&pdev->dev, "device registered (reg_base=%p, irq=%u)\n",
+		priv->base, (u32) ndev->irq);
+
+	return 0;
+
+probe_exit_clk:
+	clk_put(priv->clk);
+probe_exit_candev:
+	free_candev(ndev);
+probe_exit_iounmap:
+	iounmap(addr);
+probe_exit_free_region:
+	release_mem_region(mem->start, resource_size(mem));
+probe_exit:
+	return err;
+}
+
+static int __devexit ti_hecc_remove(struct platform_device *pdev)
+{
+	struct resource *res;
+	struct net_device *ndev = platform_get_drvdata(pdev);
+	struct ti_hecc_priv *priv = netdev_priv(ndev);
+
+	clk_put(priv->clk);
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	iounmap(priv->base);
+	release_mem_region(res->start, resource_size(res));
+	unregister_candev(ndev);
+	free_candev(ndev);
+	platform_set_drvdata(pdev, NULL);
+
+	return 0;
+}
+
+/* TI HECC netdevice driver: platform driver structure */
+static struct platform_driver ti_hecc_driver = {
+	.driver = {
+		.name    = DRV_NAME,
+		.owner   = THIS_MODULE,
+	},
+	.probe = ti_hecc_probe,
+	.remove = __devexit_p(ti_hecc_remove),
+};
+
+static int __init ti_hecc_init_driver(void)
+{
+	printk(KERN_INFO DRV_DESC "\n");
+	return platform_driver_register(&ti_hecc_driver);
+}
+module_init(ti_hecc_init_driver);
+
+static void __exit ti_hecc_exit_driver(void)
+{
+	printk(KERN_INFO DRV_DESC " unloaded\n");
+	platform_driver_unregister(&ti_hecc_driver);
+}
+module_exit(ti_hecc_exit_driver);
+
+MODULE_AUTHOR("Anant Gole <anantgole@ti.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION(DRV_DESC);
diff --git a/include/linux/can/platform/ti_hecc.h b/include/linux/can/platform/ti_hecc.h
new file mode 100644
index 000000000000..4688c7bb1bd1
--- /dev/null
+++ b/include/linux/can/platform/ti_hecc.h
@@ -0,0 +1,40 @@
+/*
+ * TI HECC (High End CAN Controller) driver platform header
+ *
+ * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed as is WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/**
+ * struct hecc_platform_data - HECC Platform Data
+ *
+ * @scc_hecc_offset:	mostly 0 - should really never change
+ * @scc_ram_offset:	SCC RAM offset
+ * @hecc_ram_offset:	HECC RAM offset
+ * @mbx_offset:		Mailbox RAM offset
+ * @int_line:		Interrupt line to use - 0 or 1
+ * @version:		version for future use
+ *
+ * Platform data structure to get all platform specific settings.
+ * this structure also accounts the fact that the IP may have different
+ * RAM and mailbox offsets for different SOC's
+ */
+struct ti_hecc_platform_data {
+	u32 scc_hecc_offset;
+	u32 scc_ram_offset;
+	u32 hecc_ram_offset;
+	u32 mbx_offset;
+	u32 int_line;
+	u32 version;
+};
+
+
-- 
cgit v1.2.3


From e0e6f55d298af03ab88bfe8455b671d29d78f426 Mon Sep 17 00:00:00 2001
From: Jin Dongming <jin.dongming@np.css.fujitsu.com>
Date: Thu, 8 Oct 2009 22:44:47 -0700
Subject: ipv6: Fix the size overflow of addrconf_sysctl array

(This patch fixes bug of commit f7734fdf61ec6bb848e0bafc1fb8bad2c124bb50
 title "make TLLAO option for NA packets configurable")

When the IPV6 conf is used, the function sysctl_set_parent is called and the
array addrconf_sysctl is used as a parameter of the function.

The above patch added new conf "force_tllao" into the array addrconf_sysctl,
but the size of the array was not modified, the static allocated size is
DEVCONF_MAX + 1 but the real size is DEVCONF_MAX + 2, so the problem is
that the function sysctl_set_parent accessed wrong address.

I got the following information.
Call Trace:
    [<ffffffff8106085d>] sysctl_set_parent+0x29/0x3e
    [<ffffffff8106085d>] sysctl_set_parent+0x29/0x3e
    [<ffffffff8106085d>] sysctl_set_parent+0x29/0x3e
    [<ffffffff8106085d>] sysctl_set_parent+0x29/0x3e
    [<ffffffff8106085d>] sysctl_set_parent+0x29/0x3e
    [<ffffffff810622d5>] __register_sysctl_paths+0xde/0x272
    [<ffffffff8110892d>] ? __kmalloc_track_caller+0x16e/0x180
    [<ffffffffa00cfac3>] ? __addrconf_sysctl_register+0xc5/0x144 [ipv6]
    [<ffffffff8141f2c9>] register_net_sysctl_table+0x48/0x4b
    [<ffffffffa00cfaf5>] __addrconf_sysctl_register+0xf7/0x144 [ipv6]
    [<ffffffffa00cfc16>] addrconf_init_net+0xd4/0x104 [ipv6]
    [<ffffffff8139195f>] setup_net+0x35/0x82
    [<ffffffff81391f6c>] copy_net_ns+0x76/0xe0
    [<ffffffff8107ad60>] create_new_namespaces+0xf0/0x16e
    [<ffffffff8107afee>] copy_namespaces+0x65/0x9f
    [<ffffffff81056dff>] copy_process+0xb2c/0x12c3
    [<ffffffff810576e1>] do_fork+0x14b/0x2d2
    [<ffffffff8107ac4e>] ? up_read+0xe/0x10
    [<ffffffff81438e73>] ? do_page_fault+0x27a/0x2aa
    [<ffffffff8101044b>] sys_clone+0x28/0x2a
    [<ffffffff81011fb3>] stub_clone+0x13/0x20
    [<ffffffff81011c72>] ? system_call_fastpath+0x16/0x1b

And the information of IPV6 in .config is as following.
IPV6 in .config:
    CONFIG_IPV6=m
    CONFIG_IPV6_PRIVACY=y
    CONFIG_IPV6_ROUTER_PREF=y
    CONFIG_IPV6_ROUTE_INFO=y
    CONFIG_IPV6_OPTIMISTIC_DAD=y
    CONFIG_IPV6_MIP6=m
    CONFIG_IPV6_SIT=m
    # CONFIG_IPV6_SIT_6RD is not set
    CONFIG_IPV6_NDISC_NODETYPE=y
    CONFIG_IPV6_TUNNEL=m
    CONFIG_IPV6_MULTIPLE_TABLES=y
    CONFIG_IPV6_SUBTREES=y
    CONFIG_IPV6_MROUTE=y
    CONFIG_IPV6_PIMSM_V2=y
    # CONFIG_IP_VS_IPV6 is not set
    CONFIG_NF_CONNTRACK_IPV6=m
    CONFIG_IP6_NF_MATCH_IPV6HEADER=m

I confirmed this patch fixes this problem.

Signed-off-by: Jin Dongming <jin.dongming@np.css.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ae74ede1abe7..56404251248c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -208,6 +208,7 @@ enum {
 	DEVCONF_MC_FORWARDING,
 	DEVCONF_DISABLE_IPV6,
 	DEVCONF_ACCEPT_DAD,
+	DEVCONF_FORCE_TLLAO,
 	DEVCONF_MAX
 };
 
-- 
cgit v1.2.3


From 89eda06837094ce9f34fae269b8773fcfd70f046 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sun, 4 Oct 2009 21:49:47 +0900
Subject: LSM: Add security_path_chmod() and security_path_chown().

This patch allows pathname based LSM modules to check chmod()/chown()
operations. Since notify_change() does not receive "struct vfsmount *",
we add security_path_chmod() and security_path_chown() to the caller of
notify_change().

These hooks are used by TOMOYO.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/open.c                | 24 ++++++++++++++++++++----
 include/linux/security.h | 30 ++++++++++++++++++++++++++++++
 security/capability.c    | 13 +++++++++++++
 security/security.c      | 15 +++++++++++++++
 4 files changed, 78 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 4f01e06227c6..b5c294d35bd1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -616,6 +616,9 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
 	err = mnt_want_write_file(file);
 	if (err)
 		goto out_putf;
+	err = security_path_chmod(dentry, file->f_vfsmnt, mode);
+	if (err)
+		goto out_drop_write;
 	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
@@ -623,6 +626,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	err = notify_change(dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
+out_drop_write:
 	mnt_drop_write(file->f_path.mnt);
 out_putf:
 	fput(file);
@@ -645,6 +649,9 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
 	error = mnt_want_write(path.mnt);
 	if (error)
 		goto dput_and_out;
+	error = security_path_chmod(path.dentry, path.mnt, mode);
+	if (error)
+		goto out_drop_write;
 	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
@@ -652,6 +659,7 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	error = notify_change(path.dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
+out_drop_write:
 	mnt_drop_write(path.mnt);
 dput_and_out:
 	path_put(&path);
@@ -700,7 +708,9 @@ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_release;
-	error = chown_common(path.dentry, user, group);
+	error = security_path_chown(&path, user, group);
+	if (!error)
+		error = chown_common(path.dentry, user, group);
 	mnt_drop_write(path.mnt);
 out_release:
 	path_put(&path);
@@ -725,7 +735,9 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
 	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_release;
-	error = chown_common(path.dentry, user, group);
+	error = security_path_chown(&path, user, group);
+	if (!error)
+		error = chown_common(path.dentry, user, group);
 	mnt_drop_write(path.mnt);
 out_release:
 	path_put(&path);
@@ -744,7 +756,9 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
 	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_release;
-	error = chown_common(path.dentry, user, group);
+	error = security_path_chown(&path, user, group);
+	if (!error)
+		error = chown_common(path.dentry, user, group);
 	mnt_drop_write(path.mnt);
 out_release:
 	path_put(&path);
@@ -767,7 +781,9 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 		goto out_fput;
 	dentry = file->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = chown_common(dentry, user, group);
+	error = security_path_chown(&file->f_path, user, group);
+	if (!error)
+		error = chown_common(dentry, user, group);
 	mnt_drop_write(file->f_path.mnt);
 out_fput:
 	fput(file);
diff --git a/include/linux/security.h b/include/linux/security.h
index 239e40d0450b..c8a584c26f7b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -447,6 +447,18 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@new_dir contains the path structure for parent of the new link.
  *	@new_dentry contains the dentry structure of the new link.
  *	Return 0 if permission is granted.
+ * @path_chmod:
+ *	Check for permission to change DAC's permission of a file or directory.
+ *	@dentry contains the dentry structure.
+ *	@mnt contains the vfsmnt structure.
+ *	@mode contains DAC's mode.
+ *	Return 0 if permission is granted.
+ * @path_chown:
+ *	Check for permission to change owner/group of a file or directory.
+ *	@path contains the path structure.
+ *	@uid contains new owner's ID.
+ *	@gid contains new group's ID.
+ *	Return 0 if permission is granted.
  * @inode_readlink:
  *	Check the permission to read the symbolic link.
  *	@dentry contains the dentry structure for the file link.
@@ -1488,6 +1500,9 @@ struct security_operations {
 			  struct dentry *new_dentry);
 	int (*path_rename) (struct path *old_dir, struct dentry *old_dentry,
 			    struct path *new_dir, struct dentry *new_dentry);
+	int (*path_chmod) (struct dentry *dentry, struct vfsmount *mnt,
+			   mode_t mode);
+	int (*path_chown) (struct path *path, uid_t uid, gid_t gid);
 #endif
 
 	int (*inode_alloc_security) (struct inode *inode);
@@ -2952,6 +2967,9 @@ int security_path_link(struct dentry *old_dentry, struct path *new_dir,
 		       struct dentry *new_dentry);
 int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
 			 struct path *new_dir, struct dentry *new_dentry);
+int security_path_chmod(struct dentry *dentry, struct vfsmount *mnt,
+			mode_t mode);
+int security_path_chown(struct path *path, uid_t uid, gid_t gid);
 #else	/* CONFIG_SECURITY_PATH */
 static inline int security_path_unlink(struct path *dir, struct dentry *dentry)
 {
@@ -3001,6 +3019,18 @@ static inline int security_path_rename(struct path *old_dir,
 {
 	return 0;
 }
+
+static inline int security_path_chmod(struct dentry *dentry,
+				      struct vfsmount *mnt,
+				      mode_t mode)
+{
+	return 0;
+}
+
+static inline int security_path_chown(struct path *path, uid_t uid, gid_t gid)
+{
+	return 0;
+}
 #endif	/* CONFIG_SECURITY_PATH */
 
 #ifdef CONFIG_KEYS
diff --git a/security/capability.c b/security/capability.c
index fce07a7bc825..09279a8d4a14 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -308,6 +308,17 @@ static int cap_path_truncate(struct path *path, loff_t length,
 {
 	return 0;
 }
+
+static int cap_path_chmod(struct dentry *dentry, struct vfsmount *mnt,
+			  mode_t mode)
+{
+	return 0;
+}
+
+static int cap_path_chown(struct path *path, uid_t uid, gid_t gid)
+{
+	return 0;
+}
 #endif
 
 static int cap_file_permission(struct file *file, int mask)
@@ -977,6 +988,8 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, path_link);
 	set_to_cap_if_null(ops, path_rename);
 	set_to_cap_if_null(ops, path_truncate);
+	set_to_cap_if_null(ops, path_chmod);
+	set_to_cap_if_null(ops, path_chown);
 #endif
 	set_to_cap_if_null(ops, file_permission);
 	set_to_cap_if_null(ops, file_alloc_security);
diff --git a/security/security.c b/security/security.c
index c4c673240c1c..5259270e558f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -434,6 +434,21 @@ int security_path_truncate(struct path *path, loff_t length,
 		return 0;
 	return security_ops->path_truncate(path, length, time_attrs);
 }
+
+int security_path_chmod(struct dentry *dentry, struct vfsmount *mnt,
+			mode_t mode)
+{
+	if (unlikely(IS_PRIVATE(dentry->d_inode)))
+		return 0;
+	return security_ops->path_chmod(dentry, mnt, mode);
+}
+
+int security_path_chown(struct path *path, uid_t uid, gid_t gid)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_chown(path, uid, gid);
+}
 #endif
 
 int security_inode_create(struct inode *dir, struct dentry *dentry, int mode)
-- 
cgit v1.2.3


From 8b8efb44033c7e86b3dc76f825c693ec92ae30e9 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sun, 4 Oct 2009 21:49:48 +0900
Subject: LSM: Add security_path_chroot().

This patch allows pathname based LSM modules to check chroot() operations.

This hook is used by TOMOYO.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/open.c                |  3 +++
 include/linux/security.h | 11 +++++++++++
 security/capability.c    |  6 ++++++
 security/security.c      |  5 +++++
 4 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index b5c294d35bd1..201041dfca57 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -587,6 +587,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 	error = -EPERM;
 	if (!capable(CAP_SYS_CHROOT))
 		goto dput_and_out;
+	error = security_path_chroot(&path);
+	if (error)
+		goto dput_and_out;
 
 	set_fs_root(current->fs, &path);
 	error = 0;
diff --git a/include/linux/security.h b/include/linux/security.h
index c8a584c26f7b..ed0faea60b82 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -459,6 +459,10 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@uid contains new owner's ID.
  *	@gid contains new group's ID.
  *	Return 0 if permission is granted.
+ * @path_chroot:
+ *	Check for permission to change root directory.
+ *	@path contains the path structure.
+ *	Return 0 if permission is granted.
  * @inode_readlink:
  *	Check the permission to read the symbolic link.
  *	@dentry contains the dentry structure for the file link.
@@ -1503,6 +1507,7 @@ struct security_operations {
 	int (*path_chmod) (struct dentry *dentry, struct vfsmount *mnt,
 			   mode_t mode);
 	int (*path_chown) (struct path *path, uid_t uid, gid_t gid);
+	int (*path_chroot) (struct path *path);
 #endif
 
 	int (*inode_alloc_security) (struct inode *inode);
@@ -2970,6 +2975,7 @@ int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
 int security_path_chmod(struct dentry *dentry, struct vfsmount *mnt,
 			mode_t mode);
 int security_path_chown(struct path *path, uid_t uid, gid_t gid);
+int security_path_chroot(struct path *path);
 #else	/* CONFIG_SECURITY_PATH */
 static inline int security_path_unlink(struct path *dir, struct dentry *dentry)
 {
@@ -3031,6 +3037,11 @@ static inline int security_path_chown(struct path *path, uid_t uid, gid_t gid)
 {
 	return 0;
 }
+
+static inline int security_path_chroot(struct path *path)
+{
+	return 0;
+}
 #endif	/* CONFIG_SECURITY_PATH */
 
 #ifdef CONFIG_KEYS
diff --git a/security/capability.c b/security/capability.c
index 09279a8d4a14..4f3ab476937f 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -319,6 +319,11 @@ static int cap_path_chown(struct path *path, uid_t uid, gid_t gid)
 {
 	return 0;
 }
+
+static int cap_path_chroot(struct path *root)
+{
+	return 0;
+}
 #endif
 
 static int cap_file_permission(struct file *file, int mask)
@@ -990,6 +995,7 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, path_truncate);
 	set_to_cap_if_null(ops, path_chmod);
 	set_to_cap_if_null(ops, path_chown);
+	set_to_cap_if_null(ops, path_chroot);
 #endif
 	set_to_cap_if_null(ops, file_permission);
 	set_to_cap_if_null(ops, file_alloc_security);
diff --git a/security/security.c b/security/security.c
index 5259270e558f..279757314a05 100644
--- a/security/security.c
+++ b/security/security.c
@@ -449,6 +449,11 @@ int security_path_chown(struct path *path, uid_t uid, gid_t gid)
 		return 0;
 	return security_ops->path_chown(path, uid, gid);
 }
+
+int security_path_chroot(struct path *path)
+{
+	return security_ops->path_chroot(path);
+}
 #endif
 
 int security_inode_create(struct inode *dir, struct dentry *dentry, int mode)
-- 
cgit v1.2.3


From d5e63bded6e819ca77ee1a1d97c783a31f6caf30 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 12 Oct 2009 03:00:31 -0700
Subject: Revert "af_packet: add interframe drop cmsg (v6)"

This reverts commit 977750076d98c7ff6cbda51858bb5a5894a9d9ab.

Neil is reimplementing this generically, outside of AF_PACKET.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_packet.h |  2 --
 net/packet/af_packet.c    | 33 ---------------------------------
 2 files changed, 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index e5d200f53fc3..dea7d6b7cf98 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -48,13 +48,11 @@ struct sockaddr_ll
 #define PACKET_RESERVE			12
 #define PACKET_TX_RING			13
 #define PACKET_LOSS			14
-#define PACKET_GAPDATA			15
 
 struct tpacket_stats
 {
 	unsigned int	tp_packets;
 	unsigned int	tp_drops;
-	unsigned int    tp_gap;
 };
 
 struct tpacket_auxdata
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 70073a0dea5d..f87ed4803c11 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -524,31 +524,6 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
 	return res;
 }
 
-/*
- * If we've lost frames since the last time we queued one to the
- * sk_receive_queue, we need to record it here.
- * This must be called under the protection of the socket lock
- * to prevent racing with other softirqs and user space
- */
-static inline void record_packet_gap(struct sk_buff *skb,
-					struct packet_sock *po)
-{
-	/*
-	 * We overload the mark field here, since we're about
-	 * to enqueue to a receive queue and no body else will
-	 * use this field at this point
-	 */
-	skb->mark = po->stats.tp_gap;
-	po->stats.tp_gap = 0;
-	return;
-
-}
-
-static inline __u32 check_packet_gap(struct sk_buff *skb)
-{
-	return skb->mark;
-}
-
 /*
    This function makes lazy skb cloning in hope that most of packets
    are discarded by BPF.
@@ -652,7 +627,6 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	spin_lock(&sk->sk_receive_queue.lock);
 	po->stats.tp_packets++;
-	record_packet_gap(skb, po);
 	__skb_queue_tail(&sk->sk_receive_queue, skb);
 	spin_unlock(&sk->sk_receive_queue.lock);
 	sk->sk_data_ready(sk, skb->len);
@@ -661,7 +635,6 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 drop_n_acct:
 	spin_lock(&sk->sk_receive_queue.lock);
 	po->stats.tp_drops++;
-	po->stats.tp_gap++;
 	spin_unlock(&sk->sk_receive_queue.lock);
 
 drop_n_restore:
@@ -839,7 +812,6 @@ drop:
 
 ring_is_full:
 	po->stats.tp_drops++;
-	po->stats.tp_gap++;
 	spin_unlock(&sk->sk_receive_queue.lock);
 
 	sk->sk_data_ready(sk, 0);
@@ -1449,7 +1421,6 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 	struct sk_buff *skb;
 	int copied, err;
 	struct sockaddr_ll *sll;
-	__u32 gap;
 
 	err = -EINVAL;
 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
@@ -1528,10 +1499,6 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
 	}
 
-	gap = check_packet_gap(skb);
-	if (gap)
-		put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(__u32), &gap);
-
 	/*
 	 *	Free or return the buffer as appropriate. Again this
 	 *	hides all the races and re-entrancy issues from us.
-- 
cgit v1.2.3


From 3b885787ea4112eaa80945999ea0901bf742707f Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 12 Oct 2009 13:26:31 -0700
Subject: net: Generalize socket rx gap / receive queue overflow cmsg

Create a new socket level option to report number of queue overflows

Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames.  This value was
exported via a SOL_PACKET level cmsg.  AFter I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option.  As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue
overflowed between any two given frames.  It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count).  Tested
successfully by me.

Notes:

1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.

2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if thats
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me.  This also saves us having
to code in a per-protocol opt in mechanism.

3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/asm/socket.h   |  2 ++
 arch/arm/include/asm/socket.h     |  2 ++
 arch/avr32/include/asm/socket.h   |  2 ++
 arch/cris/include/asm/socket.h    |  2 ++
 arch/frv/include/asm/socket.h     |  2 ++
 arch/h8300/include/asm/socket.h   |  2 ++
 arch/ia64/include/asm/socket.h    |  2 ++
 arch/m32r/include/asm/socket.h    |  2 ++
 arch/m68k/include/asm/socket.h    |  2 ++
 arch/mips/include/asm/socket.h    |  2 ++
 arch/mn10300/include/asm/socket.h |  2 ++
 arch/parisc/include/asm/socket.h  |  2 ++
 arch/powerpc/include/asm/socket.h |  2 ++
 arch/s390/include/asm/socket.h    |  2 ++
 arch/sparc/include/asm/socket.h   |  2 ++
 arch/xtensa/include/asm/socket.h  |  2 ++
 include/asm-generic/socket.h      |  1 +
 include/linux/skbuff.h            |  6 ++++--
 include/net/sock.h                |  3 +++
 net/atm/common.c                  |  2 +-
 net/bluetooth/af_bluetooth.c      |  2 +-
 net/bluetooth/rfcomm/sock.c       |  2 +-
 net/can/bcm.c                     |  2 +-
 net/can/raw.c                     |  2 +-
 net/core/sock.c                   | 17 ++++++++++++++++-
 net/ieee802154/dgram.c            |  2 +-
 net/ieee802154/raw.c              |  2 +-
 net/ipv4/raw.c                    |  2 +-
 net/ipv4/udp.c                    |  2 +-
 net/ipv6/raw.c                    |  2 +-
 net/ipv6/udp.c                    |  2 +-
 net/key/af_key.c                  |  2 +-
 net/packet/af_packet.c            |  7 +++----
 net/rxrpc/ar-recvmsg.c            |  2 +-
 net/sctp/socket.c                 |  2 +-
 net/socket.c                      | 15 +++++++++++++++
 36 files changed, 88 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/socket.h b/arch/alpha/include/asm/socket.h
index 26773e3246e2..06edfefc3373 100644
--- a/arch/alpha/include/asm/socket.h
+++ b/arch/alpha/include/asm/socket.h
@@ -67,6 +67,8 @@
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_RXQ_OVFL             40
+
 /* O_NONBLOCK clashes with the bits used for socket types.  Therefore we
  * have to define SOCK_NONBLOCK to a different value here.
  */
diff --git a/arch/arm/include/asm/socket.h b/arch/arm/include/asm/socket.h
index 92ac61d294fd..90ffd04b8e74 100644
--- a/arch/arm/include/asm/socket.h
+++ b/arch/arm/include/asm/socket.h
@@ -60,4 +60,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/avr32/include/asm/socket.h b/arch/avr32/include/asm/socket.h
index fe863f9794d5..c8d1fae49476 100644
--- a/arch/avr32/include/asm/socket.h
+++ b/arch/avr32/include/asm/socket.h
@@ -60,4 +60,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* __ASM_AVR32_SOCKET_H */
diff --git a/arch/cris/include/asm/socket.h b/arch/cris/include/asm/socket.h
index 45ec49bdb7b1..1a4a61909ca8 100644
--- a/arch/cris/include/asm/socket.h
+++ b/arch/cris/include/asm/socket.h
@@ -62,6 +62,8 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* _ASM_SOCKET_H */
 
 
diff --git a/arch/frv/include/asm/socket.h b/arch/frv/include/asm/socket.h
index 2dea726095c2..a6b26880c1ec 100644
--- a/arch/frv/include/asm/socket.h
+++ b/arch/frv/include/asm/socket.h
@@ -60,5 +60,7 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/h8300/include/asm/socket.h b/arch/h8300/include/asm/socket.h
index 1547f01c8e22..04c0f4596eb5 100644
--- a/arch/h8300/include/asm/socket.h
+++ b/arch/h8300/include/asm/socket.h
@@ -60,4 +60,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/ia64/include/asm/socket.h b/arch/ia64/include/asm/socket.h
index 0b0d5ff062e5..51427eaa51ba 100644
--- a/arch/ia64/include/asm/socket.h
+++ b/arch/ia64/include/asm/socket.h
@@ -69,4 +69,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/asm/socket.h b/arch/m32r/include/asm/socket.h
index 3390a864f224..469787c30098 100644
--- a/arch/m32r/include/asm/socket.h
+++ b/arch/m32r/include/asm/socket.h
@@ -60,4 +60,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/m68k/include/asm/socket.h b/arch/m68k/include/asm/socket.h
index eee01cce921b..9bf49c87d954 100644
--- a/arch/m68k/include/asm/socket.h
+++ b/arch/m68k/include/asm/socket.h
@@ -60,4 +60,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/mips/include/asm/socket.h b/arch/mips/include/asm/socket.h
index ae05accd9fe4..9de5190f2487 100644
--- a/arch/mips/include/asm/socket.h
+++ b/arch/mips/include/asm/socket.h
@@ -80,6 +80,8 @@ To add: #define SO_REUSEPORT 0x0200	/* Allow local address and port reuse.  */
 #define SO_TIMESTAMPING		37
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_RXQ_OVFL             40
+
 #ifdef __KERNEL__
 
 /** sock_type - Socket types
diff --git a/arch/mn10300/include/asm/socket.h b/arch/mn10300/include/asm/socket.h
index 4df75af29d76..4e60c4281288 100644
--- a/arch/mn10300/include/asm/socket.h
+++ b/arch/mn10300/include/asm/socket.h
@@ -60,4 +60,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/asm/socket.h b/arch/parisc/include/asm/socket.h
index 960b1e5d8e16..225b7d6a1a0a 100644
--- a/arch/parisc/include/asm/socket.h
+++ b/arch/parisc/include/asm/socket.h
@@ -59,6 +59,8 @@
 #define SO_TIMESTAMPING		0x4020
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_RXQ_OVFL             0x4021
+
 /* O_NONBLOCK clashes with the bits used for socket types.  Therefore we
  * have to define SOCK_NONBLOCK to a different value here.
  */
diff --git a/arch/powerpc/include/asm/socket.h b/arch/powerpc/include/asm/socket.h
index 3ab8b3e6feb0..866f7606da68 100644
--- a/arch/powerpc/include/asm/socket.h
+++ b/arch/powerpc/include/asm/socket.h
@@ -67,4 +67,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/asm/socket.h b/arch/s390/include/asm/socket.h
index e42df89a0b85..fdff1e995c73 100644
--- a/arch/s390/include/asm/socket.h
+++ b/arch/s390/include/asm/socket.h
@@ -68,4 +68,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/asm/socket.h b/arch/sparc/include/asm/socket.h
index 3a5ae3d12088..9d3fefcff2f5 100644
--- a/arch/sparc/include/asm/socket.h
+++ b/arch/sparc/include/asm/socket.h
@@ -56,6 +56,8 @@
 #define SO_TIMESTAMPING		0x0023
 #define SCM_TIMESTAMPING	SO_TIMESTAMPING
 
+#define SO_RXQ_OVFL             0x0024
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/asm/socket.h b/arch/xtensa/include/asm/socket.h
index beb3a6bdb61d..cbdf2ffaacff 100644
--- a/arch/xtensa/include/asm/socket.h
+++ b/arch/xtensa/include/asm/socket.h
@@ -71,4 +71,6 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/asm-generic/socket.h b/include/asm-generic/socket.h
index 538991cef6f0..9a6115e7cf63 100644
--- a/include/asm-generic/socket.h
+++ b/include/asm-generic/socket.h
@@ -63,4 +63,5 @@
 #define SO_PROTOCOL		38
 #define SO_DOMAIN		39
 
+#define SO_RXQ_OVFL             40
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index df7b23ac66e6..8c866b5cb97b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -389,8 +389,10 @@ struct sk_buff {
 #ifdef CONFIG_NETWORK_SECMARK
 	__u32			secmark;
 #endif
-
-	__u32			mark;
+	union {
+		__u32		mark;
+		__u32		dropcount;
+	};
 
 	__u16			vlan_tci;
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 98398bdec57d..10669b01eeab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -505,6 +505,7 @@ enum sock_flags {
 	SOCK_TIMESTAMPING_RAW_HARDWARE, /* %SOF_TIMESTAMPING_RAW_HARDWARE */
 	SOCK_TIMESTAMPING_SYS_HARDWARE, /* %SOF_TIMESTAMPING_SYS_HARDWARE */
 	SOCK_FASYNC, /* fasync() active */
+	SOCK_RXQ_OVFL,
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -1493,6 +1494,8 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 		sk->sk_stamp = kt;
 }
 
+extern void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb);
+
 /**
  * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
  * @msg:	outgoing packet
diff --git a/net/atm/common.c b/net/atm/common.c
index 950bd16d2383..d61e051e0a3f 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -496,7 +496,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
 	if (error)
 		return error;
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 	pr_debug("RcvM %d -= %d\n", atomic_read(&sk->sk_rmem_alloc), skb->truesize);
 	atm_return(vcc, skb->truesize);
 	skb_free_datagram(sk, skb);
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 1f6e49c1cde8..399e59c9c6cb 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -257,7 +257,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	skb_reset_transport_header(skb);
 	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
 	if (err == 0)
-		sock_recv_timestamp(msg, sk, skb);
+		sock_recv_ts_and_drops(msg, sk, skb);
 
 	skb_free_datagram(sk, skb);
 
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index c70786503850..d3bfc1b0afb1 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -703,7 +703,7 @@ static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copied += chunk;
 		size   -= chunk;
 
-		sock_recv_timestamp(msg, sk, skb);
+		sock_recv_ts_and_drops(msg, sk, skb);
 
 		if (!(flags & MSG_PEEK)) {
 			atomic_sub(chunk, &sk->sk_rmem_alloc);
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 597da4f8f888..2f47039c79dd 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1534,7 +1534,7 @@ static int bcm_recvmsg(struct kiocb *iocb, struct socket *sock,
 		return err;
 	}
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	if (msg->msg_name) {
 		msg->msg_namelen = sizeof(struct sockaddr_can);
diff --git a/net/can/raw.c b/net/can/raw.c
index b5e897922d32..962fc9f1d0c7 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -702,7 +702,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock,
 		return err;
 	}
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	if (msg->msg_name) {
 		msg->msg_namelen = sizeof(struct sockaddr_can);
diff --git a/net/core/sock.c b/net/core/sock.c
index 7626b6aacd68..43ca2c995393 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -276,6 +276,8 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int err = 0;
 	int skb_len;
+	unsigned long flags;
+	struct sk_buff_head *list = &sk->sk_receive_queue;
 
 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 	   number of warnings when compiling with -W --ANK
@@ -305,7 +307,10 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	skb_len = skb->len;
 
-	skb_queue_tail(&sk->sk_receive_queue, skb);
+	spin_lock_irqsave(&list->lock, flags);
+	skb->dropcount = atomic_read(&sk->sk_drops);
+	__skb_queue_tail(list, skb);
+	spin_unlock_irqrestore(&list->lock, flags);
 
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk, skb_len);
@@ -702,6 +707,12 @@ set_rcvbuf:
 
 		/* We implement the SO_SNDLOWAT etc to
 		   not be settable (1003.1g 5.3) */
+	case SO_RXQ_OVFL:
+		if (valbool)
+			sock_set_flag(sk, SOCK_RXQ_OVFL);
+		else
+			sock_reset_flag(sk, SOCK_RXQ_OVFL);
+		break;
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -901,6 +912,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sk->sk_mark;
 		break;
 
+	case SO_RXQ_OVFL:
+		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index a413b1bf4465..25ad956a39d8 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -303,7 +303,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
 	if (err)
 		goto done;
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	if (flags & MSG_TRUNC)
 		copied = skb->len;
diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c
index 30e74eee07d6..769c8d138fc3 100644
--- a/net/ieee802154/raw.c
+++ b/net/ieee802154/raw.c
@@ -191,7 +191,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (err)
 		goto done;
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	if (flags & MSG_TRUNC)
 		copied = skb->len;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 757c9171e7c2..f18172b07611 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -682,7 +682,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (err)
 		goto done;
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	/* Copy the address. */
 	if (sin) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 194bcdc6d9fc..71e5353b30c8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -955,7 +955,7 @@ try_again:
 		UDP_INC_STATS_USER(sock_net(sk),
 				UDP_MIB_INDATAGRAMS, is_udplite);
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	/* Copy the address. */
 	if (sin) {
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 4f24570b0869..d8375bc7f2d5 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -497,7 +497,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 			sin6->sin6_scope_id = IP6CB(skb)->iif;
 	}
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	if (np->rxopt.all)
 		datagram_recv_ctl(sk, msg, skb);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index ff778c172ef2..1f8e2afa4490 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -252,7 +252,7 @@ try_again:
 					UDP_MIB_INDATAGRAMS, is_udplite);
 	}
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	/* Copy the address. */
 	if (msg->msg_name) {
diff --git a/net/key/af_key.c b/net/key/af_key.c
index c078ae6e975b..472f6594184a 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3606,7 +3606,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb,
 	if (err)
 		goto out_free;
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	err = (flags & MSG_TRUNC) ? skb->len : copied;
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f87ed4803c11..bf3a2954cd4d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -627,15 +627,14 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	spin_lock(&sk->sk_receive_queue.lock);
 	po->stats.tp_packets++;
+	skb->dropcount = atomic_read(&sk->sk_drops);
 	__skb_queue_tail(&sk->sk_receive_queue, skb);
 	spin_unlock(&sk->sk_receive_queue.lock);
 	sk->sk_data_ready(sk, skb->len);
 	return 0;
 
 drop_n_acct:
-	spin_lock(&sk->sk_receive_queue.lock);
-	po->stats.tp_drops++;
-	spin_unlock(&sk->sk_receive_queue.lock);
+	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
 
 drop_n_restore:
 	if (skb_head != skb->data && skb_shared(skb)) {
@@ -1478,7 +1477,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (err)
 		goto out_free;
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 
 	if (msg->msg_name)
 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index a39bf97f8830..60c2b94e6b54 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -146,7 +146,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock,
 				memcpy(msg->msg_name,
 				       &call->conn->trans->peer->srx,
 				       sizeof(call->conn->trans->peer->srx));
-			sock_recv_timestamp(msg, &rx->sk, skb);
+			sock_recv_ts_and_drops(msg, &rx->sk, skb);
 		}
 
 		/* receive the message */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c8d05758661d..0970e92c6acd 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1958,7 +1958,7 @@ SCTP_STATIC int sctp_recvmsg(struct kiocb *iocb, struct sock *sk,
 	if (err)
 		goto out_free;
 
-	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_ts_and_drops(msg, sk, skb);
 	if (sctp_ulpevent_is_notification(event)) {
 		msg->msg_flags |= MSG_NOTIFICATION;
 		sp->pf->event_msgname(event, msg->msg_name, addr_len);
diff --git a/net/socket.c b/net/socket.c
index 954f3381cc8a..807935693846 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -668,6 +668,21 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 
 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 
+inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
+{
+	if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount)
+		put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
+			sizeof(__u32), &skb->dropcount);
+}
+
+void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
+	struct sk_buff *skb)
+{
+	sock_recv_timestamp(msg, sk, skb);
+	sock_recv_drops(msg, sk, skb);
+}
+EXPORT_SYMBOL_GPL(sock_recv_ts_and_drops);
+
 static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 				 struct msghdr *msg, size_t size, int flags)
 {
-- 
cgit v1.2.3


From a2e2725541fad72416326798c2d7fa4dafb7d337 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 12 Oct 2009 23:40:10 -0700
Subject: net: Introduce recvmmsg socket syscall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Meaning receive multiple messages, reducing the number of syscalls and
net stack entry/exit operations.

Next patches will introduce mechanisms where protocols that want to
optimize this operation will provide an unlocked_recvmsg operation.

This takes into account comments made by:

. Paul Moore: sock_recvmsg is called only for the first datagram,
  sock_recvmsg_nosec is used for the rest.

. Caitlin Bestler: recvmmsg now has a struct timespec timeout, that
  works in the same fashion as the ppoll one.

  If the underlying protocol returns a datagram with MSG_OOB set, this
  will make recvmmsg return right away with as many datagrams (+ the OOB
  one) it has received so far.

. Rémi Denis-Courmont & Steven Whitehouse: If we receive N < vlen
  datagrams and then recvmsg returns an error, recvmmsg will return
  the successfully received datagrams, store the error and return it
  in the next call.

This paves the way for a subsequent optimization, sk_prot->unlocked_recvmsg,
where we will be able to acquire the lock only at batch start and end, not at
every underlying recvmsg call.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/kernel/systbls.S            |   1 +
 arch/arm/kernel/calls.S                |   1 +
 arch/avr32/kernel/syscall_table.S      |   1 +
 arch/blackfin/mach-common/entry.S      |   1 +
 arch/ia64/kernel/entry.S               |   1 +
 arch/microblaze/kernel/syscall_table.S |   1 +
 arch/mips/kernel/scall32-o32.S         |   1 +
 arch/mips/kernel/scall64-64.S          |   1 +
 arch/mips/kernel/scall64-n32.S         |   1 +
 arch/mips/kernel/scall64-o32.S         |   1 +
 arch/sh/kernel/syscalls_64.S           |   1 +
 arch/sparc/kernel/systbls_32.S         |   2 +-
 arch/sparc/kernel/systbls_64.S         |   4 +-
 arch/x86/ia32/ia32entry.S              |   1 +
 arch/x86/include/asm/unistd_32.h       |   3 +-
 arch/x86/include/asm/unistd_64.h       |   2 +
 arch/x86/kernel/syscall_table_32.S     |   1 +
 arch/xtensa/include/asm/unistd.h       |   4 +-
 include/linux/net.h                    |   1 +
 include/linux/socket.h                 |  10 ++
 include/linux/syscalls.h               |   4 +
 include/net/compat.h                   |   8 ++
 kernel/sys_ni.c                        |   2 +
 net/compat.c                           |  33 ++++-
 net/socket.c                           | 225 +++++++++++++++++++++++++++------
 25 files changed, 261 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S
index 95c9aef1c106..cda6b8b3d573 100644
--- a/arch/alpha/kernel/systbls.S
+++ b/arch/alpha/kernel/systbls.S
@@ -497,6 +497,7 @@ sys_call_table:
 	.quad sys_signalfd
 	.quad sys_ni_syscall
 	.quad sys_eventfd
+	.quad sys_recvmmsg
 
 	.size sys_call_table, . - sys_call_table
 	.type sys_call_table, @object
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index fafce1b5c69f..f58c1156e779 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -374,6 +374,7 @@
 		CALL(sys_pwritev)
 		CALL(sys_rt_tgsigqueueinfo)
 		CALL(sys_perf_event_open)
+/* 365 */	CALL(sys_recvmmsg)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S
index 7ee0057613b3..e76bad16b0f0 100644
--- a/arch/avr32/kernel/syscall_table.S
+++ b/arch/avr32/kernel/syscall_table.S
@@ -295,4 +295,5 @@ sys_call_table:
 	.long	sys_signalfd
 	.long	sys_ni_syscall		/* 280, was sys_timerfd */
 	.long	sys_eventfd
+	.long	sys_recvmmsg
 	.long	sys_ni_syscall		/* r8 is saturated at nr_syscalls */
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index 1e7cac23e25f..48692724b74c 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -1621,6 +1621,7 @@ ENTRY(_sys_call_table)
 	.long _sys_pwritev
 	.long _sys_rt_tgsigqueueinfo
 	.long _sys_perf_event_open
+	.long _sys_recvmmsg		/* 370 */
 
 	.rept NR_syscalls-(.-_sys_call_table)/4
 	.long _sys_ni_syscall
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index d0e7d37017b4..d75b872ca4dc 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1806,6 +1806,7 @@ sys_call_table:
 	data8 sys_preadv
 	data8 sys_pwritev			// 1320
 	data8 sys_rt_tgsigqueueinfo
+	data8 sys_recvmmsg
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
 #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S
index ecec19155135..c1ab1dc10898 100644
--- a/arch/microblaze/kernel/syscall_table.S
+++ b/arch/microblaze/kernel/syscall_table.S
@@ -371,3 +371,4 @@ ENTRY(sys_call_table)
 	.long sys_ni_syscall
 	.long sys_rt_tgsigqueueinfo	/* 365 */
 	.long sys_perf_event_open
+	.long sys_recvmmsg
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index fd2a9bb620d6..17202bbe843f 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -583,6 +583,7 @@ einval:	li	v0, -ENOSYS
 	sys	sys_rt_tgsigqueueinfo	4
 	sys	sys_perf_event_open	5
 	sys	sys_accept4		4
+	sys     sys_recvmmsg            5
 	.endm
 
 	/* We pre-compute the number of _instruction_ bytes needed to
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S
index 18bf7f32c5e4..a8a6c596eb04 100644
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -420,4 +420,5 @@ sys_call_table:
 	PTR	sys_rt_tgsigqueueinfo
 	PTR	sys_perf_event_open
 	PTR	sys_accept4
+	PTR     sys_recvmmsg
 	.size	sys_call_table,.-sys_call_table
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 6ebc07976694..5154e64f7cfe 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -418,4 +418,5 @@ EXPORT(sysn32_call_table)
 	PTR	compat_sys_rt_tgsigqueueinfo	/* 5295 */
 	PTR	sys_perf_event_open
 	PTR	sys_accept4
+	PTR     compat_sys_recvmmsg
 	.size	sysn32_call_table,.-sysn32_call_table
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index 9bbf9775e0bd..d0eff53d7cb9 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -538,4 +538,5 @@ sys_call_table:
 	PTR	compat_sys_rt_tgsigqueueinfo
 	PTR	sys_perf_event_open
 	PTR	sys_accept4
+	PTR     compat_sys_recvmmsg
 	.size	sys_call_table,.-sys_call_table
diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S
index 5bfde6c77498..07d2aaea9ae8 100644
--- a/arch/sh/kernel/syscalls_64.S
+++ b/arch/sh/kernel/syscalls_64.S
@@ -391,3 +391,4 @@ sys_call_table:
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo
 	.long sys_perf_event_open
+	.long sys_recvmmsg		/* 365 */
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index 0f1658d37490..ceb1530f8aa6 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -82,5 +82,5 @@ sys_call_table:
 /*310*/	.long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 /*315*/	.long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-/*325*/	.long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
+/*325*/	.long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg
 
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 009825f6e73c..f37bef747e60 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -83,7 +83,7 @@ sys_call_table32:
 /*310*/	.word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate
 	.word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv
-	.word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open
+	.word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open, compat_sys_recvmmsg
 
 #endif /* CONFIG_COMPAT */
 
@@ -158,4 +158,4 @@ sys_call_table:
 /*310*/	.word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
 	.word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
 /*320*/	.word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
-	.word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
+	.word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 74619c4f9fda..11a6c79d5f46 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -832,4 +832,5 @@ ia32_sys_call_table:
 	.quad compat_sys_pwritev
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_event_open
+	.quad compat_sys_recvmmsg
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6fb3c209a7e3..3baf379fa840 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,10 +342,11 @@
 #define __NR_pwritev		334
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_event_open	336
+#define __NR_recvmmsg		337
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 337
+#define NR_syscalls 338
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0adbc68..4843f7ba754a 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 #define __NR_perf_event_open			298
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+#define __NR_recvmmsg				299
+__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..70c2125d55b9 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
 	.long sys_pwritev
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_event_open
+	.long sys_recvmmsg
diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h
index c092c8fbb2cf..4e55dc763021 100644
--- a/arch/xtensa/include/asm/unistd.h
+++ b/arch/xtensa/include/asm/unistd.h
@@ -681,8 +681,10 @@ __SYSCALL(304, sys_signalfd, 3)
 __SYSCALL(305, sys_ni_syscall, 0)
 #define __NR_eventfd				306
 __SYSCALL(306, sys_eventfd, 1)
+#define __NR_recvmmsg				307
+__SYSCALL(307, sys_recvmmsg, 5)
 
-#define __NR_syscall_count			307
+#define __NR_syscall_count			308
 
 /*
  * sysxtensa syscall handler
diff --git a/include/linux/net.h b/include/linux/net.h
index 529a0931711d..b42bb60fe92f 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -41,6 +41,7 @@
 #define SYS_SENDMSG	16		/* sys_sendmsg(2)		*/
 #define SYS_RECVMSG	17		/* sys_recvmsg(2)		*/
 #define SYS_ACCEPT4	18		/* sys_accept4(2)		*/
+#define SYS_RECVMMSG	19		/* sys_recvmmsg(2)		*/
 
 typedef enum {
 	SS_FREE = 0,			/* not allocated		*/
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 3273a0c5043b..59966f12990c 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -65,6 +65,12 @@ struct msghdr {
 	unsigned	msg_flags;
 };
 
+/* For recvmmsg/sendmmsg */
+struct mmsghdr {
+	struct msghdr   msg_hdr;
+	unsigned        msg_len;
+};
+
 /*
  *	POSIX 1003.1g - ancillary data object information
  *	Ancillary data consits of a sequence of pairs of
@@ -312,6 +318,10 @@ extern int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uadd
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
+struct timespec;
+
+extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+			  unsigned int flags, struct timespec *timeout);
 #endif
 #endif /* not kernel and not glibc */
 #endif /* _LINUX_SOCKET_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a990ace1a838..714f063a3e6d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -25,6 +25,7 @@ struct linux_dirent64;
 struct list_head;
 struct msgbuf;
 struct msghdr;
+struct mmsghdr;
 struct msqid_ds;
 struct new_utsname;
 struct nfsctl_arg;
@@ -677,6 +678,9 @@ asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
 				struct sockaddr __user *, int __user *);
 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
+asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
+			     unsigned int vlen, unsigned flags,
+			     struct timespec __user *timeout);
 asmlinkage long sys_socket(int, int, int);
 asmlinkage long sys_socketpair(int, int, int, int __user *);
 asmlinkage long sys_socketcall(int call, unsigned long __user *args);
diff --git a/include/net/compat.h b/include/net/compat.h
index 7c3002832d05..9679f05e9896 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -18,6 +18,11 @@ struct compat_msghdr {
 	compat_uint_t	msg_flags;
 };
 
+struct compat_mmsghdr {
+	struct compat_msghdr msg_hdr;
+	compat_uint_t        msg_len;
+};
+
 struct compat_cmsghdr {
 	compat_size_t	cmsg_len;
 	compat_int_t	cmsg_level;
@@ -35,6 +40,9 @@ extern int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *);
 extern int verify_compat_iovec(struct msghdr *, struct iovec *, struct sockaddr *, int);
 extern asmlinkage long compat_sys_sendmsg(int,struct compat_msghdr __user *,unsigned);
 extern asmlinkage long compat_sys_recvmsg(int,struct compat_msghdr __user *,unsigned);
+extern asmlinkage long compat_sys_recvmmsg(int, struct compat_mmsghdr __user *,
+					   unsigned, unsigned,
+					   struct timespec __user *);
 extern asmlinkage long compat_sys_getsockopt(int, int, int, char __user *, int __user *);
 extern int put_cmsg_compat(struct msghdr*, int, int, int, void *);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e06d0b8d1951..f050ba85d420 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,8 +48,10 @@ cond_syscall(sys_shutdown);
 cond_syscall(sys_sendmsg);
 cond_syscall(compat_sys_sendmsg);
 cond_syscall(sys_recvmsg);
+cond_syscall(sys_recvmmsg);
 cond_syscall(compat_sys_recvmsg);
 cond_syscall(compat_sys_recvfrom);
+cond_syscall(compat_sys_recvmmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
diff --git a/net/compat.c b/net/compat.c
index a407c3addbae..e13f5256fd20 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -727,10 +727,10 @@ EXPORT_SYMBOL(compat_mc_getsockopt);
 
 /* Argument list sizes for compat_sys_socketcall */
 #define AL(x) ((x) * sizeof(u32))
-static unsigned char nas[19]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
+static unsigned char nas[20]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 				AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
 				AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
-				AL(4)};
+				AL(4),AL(5)};
 #undef AL
 
 asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
@@ -755,13 +755,36 @@ asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len,
 	return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen);
 }
 
+asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+				    unsigned vlen, unsigned int flags,
+				    struct timespec __user *timeout)
+{
+	int datagrams;
+	struct timespec ktspec;
+	struct compat_timespec __user *utspec =
+			(struct compat_timespec __user *)timeout;
+
+	if (get_user(ktspec.tv_sec, &utspec->tv_sec) ||
+	    get_user(ktspec.tv_nsec, &utspec->tv_nsec))
+		return -EFAULT;
+
+	datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+				   flags | MSG_CMSG_COMPAT, &ktspec);
+	if (datagrams > 0 &&
+	    (put_user(ktspec.tv_sec, &utspec->tv_sec) ||
+	     put_user(ktspec.tv_nsec, &utspec->tv_nsec)))
+		datagrams = -EFAULT;
+
+	return datagrams;
+}
+
 asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 {
 	int ret;
 	u32 a[6];
 	u32 a0, a1;
 
-	if (call < SYS_SOCKET || call > SYS_ACCEPT4)
+	if (call < SYS_SOCKET || call > SYS_RECVMMSG)
 		return -EINVAL;
 	if (copy_from_user(a, args, nas[call]))
 		return -EFAULT;
@@ -823,6 +846,10 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 	case SYS_RECVMSG:
 		ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
 		break;
+	case SYS_RECVMMSG:
+		ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
+					  compat_ptr(a[4]));
+		break;
 	case SYS_ACCEPT4:
 		ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
 		break;
diff --git a/net/socket.c b/net/socket.c
index 807935693846..9dff31c9b799 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -683,10 +683,9 @@ void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(sock_recv_ts_and_drops);
 
-static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
-				 struct msghdr *msg, size_t size, int flags)
+static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
+				       struct msghdr *msg, size_t size, int flags)
 {
-	int err;
 	struct sock_iocb *si = kiocb_to_siocb(iocb);
 
 	si->sock = sock;
@@ -695,13 +694,17 @@ static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	si->size = size;
 	si->flags = flags;
 
-	err = security_socket_recvmsg(sock, msg, size, flags);
-	if (err)
-		return err;
-
 	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
 }
 
+static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+				 struct msghdr *msg, size_t size, int flags)
+{
+	int err = security_socket_recvmsg(sock, msg, size, flags);
+
+	return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
+}
+
 int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 		 size_t size, int flags)
 {
@@ -717,6 +720,21 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 	return ret;
 }
 
+static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
+			      size_t size, int flags)
+{
+	struct kiocb iocb;
+	struct sock_iocb siocb;
+	int ret;
+
+	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
+	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
+	if (-EIOCBQUEUED == ret)
+		ret = wait_on_sync_kiocb(&iocb);
+	return ret;
+}
+
 int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 		   struct kvec *vec, size_t num, size_t size, int flags)
 {
@@ -1983,22 +2001,15 @@ out:
 	return err;
 }
 
-/*
- *	BSD recvmsg interface
- */
-
-SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
-		unsigned int, flags)
+static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
+			 struct msghdr *msg_sys, unsigned flags, int nosec)
 {
 	struct compat_msghdr __user *msg_compat =
 	    (struct compat_msghdr __user *)msg;
-	struct socket *sock;
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
-	struct msghdr msg_sys;
 	unsigned long cmsg_ptr;
 	int err, iov_size, total_len, len;
-	int fput_needed;
 
 	/* kernel mode address */
 	struct sockaddr_storage addr;
@@ -2008,27 +2019,23 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 	int __user *uaddr_len;
 
 	if (MSG_CMSG_COMPAT & flags) {
-		if (get_compat_msghdr(&msg_sys, msg_compat))
+		if (get_compat_msghdr(msg_sys, msg_compat))
 			return -EFAULT;
 	}
-	else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
+	else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
 		return -EFAULT;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
-	if (!sock)
-		goto out;
-
 	err = -EMSGSIZE;
-	if (msg_sys.msg_iovlen > UIO_MAXIOV)
-		goto out_put;
+	if (msg_sys->msg_iovlen > UIO_MAXIOV)
+		goto out;
 
 	/* Check whether to allocate the iovec area */
 	err = -ENOMEM;
-	iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
-	if (msg_sys.msg_iovlen > UIO_FASTIOV) {
+	iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
+	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
 		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
 		if (!iov)
-			goto out_put;
+			goto out;
 	}
 
 	/*
@@ -2036,46 +2043,47 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 	 *      kernel msghdr to use the kernel address space)
 	 */
 
-	uaddr = (__force void __user *)msg_sys.msg_name;
+	uaddr = (__force void __user *)msg_sys->msg_name;
 	uaddr_len = COMPAT_NAMELEN(msg);
 	if (MSG_CMSG_COMPAT & flags) {
-		err = verify_compat_iovec(&msg_sys, iov,
+		err = verify_compat_iovec(msg_sys, iov,
 					  (struct sockaddr *)&addr,
 					  VERIFY_WRITE);
 	} else
-		err = verify_iovec(&msg_sys, iov,
+		err = verify_iovec(msg_sys, iov,
 				   (struct sockaddr *)&addr,
 				   VERIFY_WRITE);
 	if (err < 0)
 		goto out_freeiov;
 	total_len = err;
 
-	cmsg_ptr = (unsigned long)msg_sys.msg_control;
-	msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+	cmsg_ptr = (unsigned long)msg_sys->msg_control;
+	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
 
 	if (sock->file->f_flags & O_NONBLOCK)
 		flags |= MSG_DONTWAIT;
-	err = sock_recvmsg(sock, &msg_sys, total_len, flags);
+	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
+							  total_len, flags);
 	if (err < 0)
 		goto out_freeiov;
 	len = err;
 
 	if (uaddr != NULL) {
 		err = move_addr_to_user((struct sockaddr *)&addr,
-					msg_sys.msg_namelen, uaddr,
+					msg_sys->msg_namelen, uaddr,
 					uaddr_len);
 		if (err < 0)
 			goto out_freeiov;
 	}
-	err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
+	err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
 			 COMPAT_FLAGS(msg));
 	if (err)
 		goto out_freeiov;
 	if (MSG_CMSG_COMPAT & flags)
-		err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg_compat->msg_controllen);
 	else
-		err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg->msg_controllen);
 	if (err)
 		goto out_freeiov;
@@ -2084,21 +2092,150 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
 out_freeiov:
 	if (iov != iovstack)
 		sock_kfree_s(sock->sk, iov, iov_size);
-out_put:
+out:
+	return err;
+}
+
+/*
+ *	BSD recvmsg interface
+ */
+
+SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
+		unsigned int, flags)
+{
+	int fput_needed, err;
+	struct msghdr msg_sys;
+	struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);
+
+	if (!sock)
+		goto out;
+
+	err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+
 	fput_light(sock->file, fput_needed);
 out:
 	return err;
 }
 
-#ifdef __ARCH_WANT_SYS_SOCKETCALL
+/*
+ *     Linux recvmmsg interface
+ */
+
+int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+		   unsigned int flags, struct timespec *timeout)
+{
+	int fput_needed, err, datagrams;
+	struct socket *sock;
+	struct mmsghdr __user *entry;
+	struct msghdr msg_sys;
+	struct timespec end_time;
+
+	if (timeout &&
+	    poll_select_set_timeout(&end_time, timeout->tv_sec,
+				    timeout->tv_nsec))
+		return -EINVAL;
+
+	datagrams = 0;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = sock_error(sock->sk);
+	if (err)
+		goto out_put;
+
+	entry = mmsg;
+
+	while (datagrams < vlen) {
+		/*
+		 * No need to ask LSM for more than the first datagram.
+		 */
+		err = __sys_recvmsg(sock, (struct msghdr __user *)entry,
+				    &msg_sys, flags, datagrams);
+		if (err < 0)
+			break;
+		err = put_user(err, &entry->msg_len);
+		if (err)
+			break;
+		++entry;
+		++datagrams;
+
+		if (timeout) {
+			ktime_get_ts(timeout);
+			*timeout = timespec_sub(end_time, *timeout);
+			if (timeout->tv_sec < 0) {
+				timeout->tv_sec = timeout->tv_nsec = 0;
+				break;
+			}
+
+			/* Timeout, return less than vlen datagrams */
+			if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
+				break;
+		}
+
+		/* Out of band data, return right away */
+		if (msg_sys.msg_flags & MSG_OOB)
+			break;
+	}
+
+out_put:
+	fput_light(sock->file, fput_needed);
 
+	if (err == 0)
+		return datagrams;
+
+	if (datagrams != 0) {
+		/*
+		 * We may return less entries than requested (vlen) if the
+		 * sock is non block and there aren't enough datagrams...
+		 */
+		if (err != -EAGAIN) {
+			/*
+			 * ... or  if recvmsg returns an error after we
+			 * received some datagrams, where we record the
+			 * error to return on the next call or if the
+			 * app asks about it using getsockopt(SO_ERROR).
+			 */
+			sock->sk->sk_err = -err;
+		}
+
+		return datagrams;
+	}
+
+	return err;
+}
+
+SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
+		unsigned int, vlen, unsigned int, flags,
+		struct timespec __user *, timeout)
+{
+	int datagrams;
+	struct timespec timeout_sys;
+
+	if (!timeout)
+		return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
+
+	if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys)))
+		return -EFAULT;
+
+	datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
+
+	if (datagrams > 0 &&
+	    copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
+		datagrams = -EFAULT;
+
+	return datagrams;
+}
+
+#ifdef __ARCH_WANT_SYS_SOCKETCALL
 /* Argument list sizes for sys_socketcall */
 #define AL(x) ((x) * sizeof(unsigned long))
-static const unsigned char nargs[19]={
+static const unsigned char nargs[20] = {
 	AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 	AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
 	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
-	AL(4)
+	AL(4),AL(5)
 };
 
 #undef AL
@@ -2118,7 +2255,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 	int err;
 	unsigned int len;
 
-	if (call < 1 || call > SYS_ACCEPT4)
+	if (call < 1 || call > SYS_RECVMMSG)
 		return -EINVAL;
 
 	len = nargs[call];
@@ -2196,6 +2333,10 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 	case SYS_RECVMSG:
 		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
 		break;
+	case SYS_RECVMMSG:
+		err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
+				   (struct timespec __user *)a[4]);
+		break;
 	case SYS_ACCEPT4:
 		err = sys_accept4(a0, (struct sockaddr __user *)a1,
 				  (int __user *)a[2], a[3]);
-- 
cgit v1.2.3


From 61321bbd6235ca9a40ba3bc249e8906cc66233c3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 7 Oct 2009 17:11:23 +0000
Subject: net: Add netdev_alloc_skb_ip_align() helper

Instead of hardcoding NET_IP_ALIGN stuff in various network drivers,
we can add a helper around netdev_alloc_skb()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8c866b5cb97b..0c68fbd6faac 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1491,6 +1491,16 @@ static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
 	return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
 }
 
+static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
+		unsigned int length)
+{
+	struct sk_buff *skb = netdev_alloc_skb(dev, length + NET_IP_ALIGN);
+
+	if (NET_IP_ALIGN && skb)
+		skb_reserve(skb, NET_IP_ALIGN);
+	return skb;
+}
+
 extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask);
 
 /**
-- 
cgit v1.2.3


From a6e4bc5304033e434fabccabb230b8e9ff55d76f Mon Sep 17 00:00:00 2001
From: Wolfgang Grandegger <wg@grandegger.com>
Date: Thu, 8 Oct 2009 22:17:11 +0000
Subject: can: make the number of echo skb's configurable

This patch allows the CAN controller driver to define the number of echo
skb's used for the local loopback (echo), as suggested by Kurt Van
Dijck, with the function:

  struct net_device *alloc_candev(int sizeof_priv,
                                  unsigned int echo_skb_max);

The CAN drivers have been adapted accordingly. For the ems_usb driver,
as suggested by Sebastian Haas, the number of echo skb's has been
increased to 10, which improves the transmission performance a lot.

Signed-off-by: Wolfgang Grandegger <wg@grandegger.com>
Signed-off-by: Kurt Van Dijck <kurt.van.dijck@eia.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/can/at91_can.c        |  2 +-
 drivers/net/can/dev.c             | 32 ++++++++++++++++++++++++++------
 drivers/net/can/sja1000/sja1000.c |  3 ++-
 drivers/net/can/sja1000/sja1000.h |  2 ++
 drivers/net/can/ti_hecc.c         |  6 +-----
 drivers/net/can/usb/ems_usb.c     |  4 ++--
 include/linux/can/dev.h           | 16 ++++++++--------
 7 files changed, 42 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index f67ae285a35a..b13fd9114130 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -1087,7 +1087,7 @@ static int __init at91_can_probe(struct platform_device *pdev)
 		goto exit_release;
 	}
 
-	dev = alloc_candev(sizeof(struct at91_priv));
+	dev = alloc_candev(sizeof(struct at91_priv), AT91_MB_TX_NUM);
 	if (!dev) {
 		err = -ENOMEM;
 		goto exit_iounmap;
diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index f0b9a1e1db46..39b99f57c265 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -245,7 +245,7 @@ static void can_flush_echo_skb(struct net_device *dev)
 	struct net_device_stats *stats = &dev->stats;
 	int i;
 
-	for (i = 0; i < CAN_ECHO_SKB_MAX; i++) {
+	for (i = 0; i < priv->echo_skb_max; i++) {
 		if (priv->echo_skb[i]) {
 			kfree_skb(priv->echo_skb[i]);
 			priv->echo_skb[i] = NULL;
@@ -262,10 +262,13 @@ static void can_flush_echo_skb(struct net_device *dev)
  * of the device driver. The driver must protect access to
  * priv->echo_skb, if necessary.
  */
-void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, int idx)
+void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
+		      unsigned int idx)
 {
 	struct can_priv *priv = netdev_priv(dev);
 
+	BUG_ON(idx >= priv->echo_skb_max);
+
 	/* check flag whether this packet has to be looped back */
 	if (!(dev->flags & IFF_ECHO) || skb->pkt_type != PACKET_LOOPBACK) {
 		kfree_skb(skb);
@@ -311,10 +314,12 @@ EXPORT_SYMBOL_GPL(can_put_echo_skb);
  * is handled in the device driver. The driver must protect
  * access to priv->echo_skb, if necessary.
  */
-void can_get_echo_skb(struct net_device *dev, int idx)
+void can_get_echo_skb(struct net_device *dev, unsigned int idx)
 {
 	struct can_priv *priv = netdev_priv(dev);
 
+	BUG_ON(idx >= priv->echo_skb_max);
+
 	if (priv->echo_skb[idx]) {
 		netif_rx(priv->echo_skb[idx]);
 		priv->echo_skb[idx] = NULL;
@@ -327,10 +332,12 @@ EXPORT_SYMBOL_GPL(can_get_echo_skb);
   *
   * The function is typically called when TX failed.
   */
-void can_free_echo_skb(struct net_device *dev, int idx)
+void can_free_echo_skb(struct net_device *dev, unsigned int idx)
 {
 	struct can_priv *priv = netdev_priv(dev);
 
+	BUG_ON(idx >= priv->echo_skb_max);
+
 	if (priv->echo_skb[idx]) {
 		kfree_skb(priv->echo_skb[idx]);
 		priv->echo_skb[idx] = NULL;
@@ -445,17 +452,30 @@ static void can_setup(struct net_device *dev)
 /*
  * Allocate and setup space for the CAN network device
  */
-struct net_device *alloc_candev(int sizeof_priv)
+struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max)
 {
 	struct net_device *dev;
 	struct can_priv *priv;
+	int size;
 
-	dev = alloc_netdev(sizeof_priv, "can%d", can_setup);
+	if (echo_skb_max)
+		size = ALIGN(sizeof_priv, sizeof(struct sk_buff *)) +
+			echo_skb_max * sizeof(struct sk_buff *);
+	else
+		size = sizeof_priv;
+
+	dev = alloc_netdev(size, "can%d", can_setup);
 	if (!dev)
 		return NULL;
 
 	priv = netdev_priv(dev);
 
+	if (echo_skb_max) {
+		priv->echo_skb_max = echo_skb_max;
+		priv->echo_skb = (void *)priv +
+			ALIGN(sizeof_priv, sizeof(struct sk_buff *));
+	}
+
 	priv->state = CAN_STATE_STOPPED;
 
 	init_timer(&priv->restart_timer);
diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index 16d2ecd2a3b7..96d8be4253f8 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -565,7 +565,8 @@ struct net_device *alloc_sja1000dev(int sizeof_priv)
 	struct net_device *dev;
 	struct sja1000_priv *priv;
 
-	dev = alloc_candev(sizeof(struct sja1000_priv) + sizeof_priv);
+	dev = alloc_candev(sizeof(struct sja1000_priv) + sizeof_priv,
+		SJA1000_ECHO_SKB_MAX);
 	if (!dev)
 		return NULL;
 
diff --git a/drivers/net/can/sja1000/sja1000.h b/drivers/net/can/sja1000/sja1000.h
index 302d2c763ad7..97a622b9302f 100644
--- a/drivers/net/can/sja1000/sja1000.h
+++ b/drivers/net/can/sja1000/sja1000.h
@@ -50,6 +50,8 @@
 #include <linux/can/dev.h>
 #include <linux/can/platform/sja1000.h>
 
+#define SJA1000_ECHO_SKB_MAX	1 /* the SJA1000 has one TX buffer object */
+
 #define SJA1000_MAX_IRQ 20	/* max. number of interrupts handled in ISR */
 
 /* SJA1000 registers - manual section 6.4 (Pelican Mode) */
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index 814e6c5c6386..23a7128e4eb7 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -74,10 +74,6 @@ MODULE_VERSION(HECC_MODULE_VERSION);
 #define HECC_MB_TX_SHIFT	2 /* as per table above */
 #define HECC_MAX_TX_MBOX	BIT(HECC_MB_TX_SHIFT)
 
-#if (HECC_MAX_TX_MBOX > CAN_ECHO_SKB_MAX)
-#error "HECC: MAX TX mailboxes should be equal or less than CAN_ECHO_SKB_MAX"
-#endif
-
 #define HECC_TX_PRIO_SHIFT	(HECC_MB_TX_SHIFT)
 #define HECC_TX_PRIO_MASK	(MAX_TX_PRIO << HECC_MB_TX_SHIFT)
 #define HECC_TX_MB_MASK		(HECC_MAX_TX_MBOX - 1)
@@ -902,7 +898,7 @@ static int ti_hecc_probe(struct platform_device *pdev)
 		goto probe_exit_free_region;
 	}
 
-	ndev = alloc_candev(sizeof(struct ti_hecc_priv));
+	ndev = alloc_candev(sizeof(struct ti_hecc_priv), HECC_MAX_TX_MBOX);
 	if (!ndev) {
 		dev_err(&pdev->dev, "alloc_candev failed\n");
 		err = -ENOMEM;
diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c
index 9012e0abc626..a65f56a9cd3d 100644
--- a/drivers/net/can/usb/ems_usb.c
+++ b/drivers/net/can/usb/ems_usb.c
@@ -232,7 +232,7 @@ MODULE_DEVICE_TABLE(usb, ems_usb_table);
 #define INTR_IN_BUFFER_SIZE 4
 
 #define MAX_RX_URBS 10
-#define MAX_TX_URBS CAN_ECHO_SKB_MAX
+#define MAX_TX_URBS 10
 
 struct ems_usb;
 
@@ -1012,7 +1012,7 @@ static int ems_usb_probe(struct usb_interface *intf,
 	struct ems_usb *dev;
 	int i, err = -ENOMEM;
 
-	netdev = alloc_candev(sizeof(struct ems_usb));
+	netdev = alloc_candev(sizeof(struct ems_usb), MAX_TX_URBS);
 	if (!netdev) {
 		dev_err(netdev->dev.parent, "Couldn't alloc candev\n");
 		return -ENOMEM;
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 5824b20b5fcb..1d3f7f00e3af 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -29,8 +29,6 @@ enum can_mode {
 /*
  * CAN common private data
  */
-#define CAN_ECHO_SKB_MAX  4
-
 struct can_priv {
 	struct can_device_stats can_stats;
 
@@ -44,15 +42,16 @@ struct can_priv {
 	int restart_ms;
 	struct timer_list restart_timer;
 
-	struct sk_buff *echo_skb[CAN_ECHO_SKB_MAX];
-
 	int (*do_set_bittiming)(struct net_device *dev);
 	int (*do_set_mode)(struct net_device *dev, enum can_mode mode);
 	int (*do_get_state)(const struct net_device *dev,
 			    enum can_state *state);
+
+	unsigned int echo_skb_max;
+	struct sk_buff **echo_skb;
 };
 
-struct net_device *alloc_candev(int sizeof_priv);
+struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max);
 void free_candev(struct net_device *dev);
 
 int open_candev(struct net_device *dev);
@@ -64,8 +63,9 @@ void unregister_candev(struct net_device *dev);
 int can_restart_now(struct net_device *dev);
 void can_bus_off(struct net_device *dev);
 
-void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, int idx);
-void can_get_echo_skb(struct net_device *dev, int idx);
-void can_free_echo_skb(struct net_device *dev, int idx);
+void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
+		      unsigned int idx);
+void can_get_echo_skb(struct net_device *dev, unsigned int idx);
+void can_free_echo_skb(struct net_device *dev, unsigned int idx);
 
 #endif /* CAN_DEV_H */
-- 
cgit v1.2.3


From aace495933a981274b6491d71b915165a61defdc Mon Sep 17 00:00:00 2001
From: Manuel Lauss <manuel.lauss@googlemail.com>
Date: Tue, 13 Oct 2009 07:25:49 +0000
Subject: net: smsc911x: allow platform_data to specify mac address

Extend the driver to accept a MAC address specified in platform_data.

Signed-off-by: Manuel Lauss <manuel.lauss@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/smsc911x.c   | 3 +++
 include/linux/smsc911x.h | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
index ccdd196f5297..6a9f51d1d9f2 100644
--- a/drivers/net/smsc911x.c
+++ b/drivers/net/smsc911x.c
@@ -2071,6 +2071,9 @@ static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
 	if (is_valid_ether_addr(dev->dev_addr)) {
 		smsc911x_set_hw_mac_address(pdata, dev->dev_addr);
 		SMSC_TRACE(PROBE, "MAC Address is specified by configuration");
+	} else if (is_valid_ether_addr(pdata->config.mac)) {
+		memcpy(dev->dev_addr, pdata->config.mac, 6);
+		SMSC_TRACE(PROBE, "MAC Address specified by platform data");
 	} else {
 		/* Try reading mac address from device. if EEPROM is present
 		 * it will already have been set */
diff --git a/include/linux/smsc911x.h b/include/linux/smsc911x.h
index 5241e4fb4eca..7144e8aa1e41 100644
--- a/include/linux/smsc911x.h
+++ b/include/linux/smsc911x.h
@@ -30,6 +30,7 @@ struct smsc911x_platform_config {
 	unsigned int irq_type;
 	unsigned int flags;
 	phy_interface_t phy_interface;
+	unsigned char mac[6];
 };
 
 /* Constants for platform_device irq polarity configuration */
-- 
cgit v1.2.3


From 8e85973efc87dfae8508f1a3440fd44612897458 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 8 Oct 2009 00:41:59 +0200
Subject: firewire: optimize config ROM creation

The config ROM image of the local node was created in CPU byte order,
then a temporary big endian copy was created to compute the CRC, and
finally the card driver created its own big endian copy.

We now generate it in big endian byte order in the first place to avoid
one byte order conversion and the temporary on-stack copy of the ROM
image (1000 bytes stack usage in process context).  Furthermore, two
1000 bytes memset()s are replaced by one 1000 bytes - ROM length sized
memset.

The trivial fw_memcpy_{from,to}_be32() helpers are now superfluous and
removed.  The newly added __compute_block_crc() function will be folded
into fw_compute_block_crc() in a subsequent change.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-card.c | 62 ++++++++++++++++++++++++++------------------
 drivers/firewire/core.h      |  7 ++---
 drivers/firewire/ohci.c      | 30 +++++++++++++--------
 include/linux/firewire.h     | 14 ----------
 4 files changed, 60 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 33898b63cdf7..f73e3bdfc84c 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -38,6 +38,18 @@
 
 #include "core.h"
 
+static int __compute_block_crc(__be32 *block)
+{
+	int length;
+	u16 crc;
+
+	length = (be32_to_cpu(block[0]) >> 16) & 0xff;
+	crc = crc_itu_t(0, (u8 *)&block[1], length * 4);
+	*block |= cpu_to_be32(crc);
+
+	return length;
+}
+
 int fw_compute_block_crc(u32 *block)
 {
 	__be32 be32_block[256];
@@ -72,11 +84,11 @@ static int descriptor_count;
 #define BIB_CMC			((1) << 30)
 #define BIB_IMC			((1) << 31)
 
-static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length)
+static __be32 *generate_config_rom(struct fw_card *card, size_t *rom_length)
 {
 	struct fw_descriptor *desc;
-	static u32 config_rom[256];
-	int i, j, length;
+	static __be32 config_rom[256];
+	int i, j, k, length;
 
 	/*
 	 * Initialize contents of config rom buffer.  On the OHCI
@@ -87,40 +99,39 @@ static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length)
 	 * the version stored in the OHCI registers.
 	 */
 
-	memset(config_rom, 0, sizeof(config_rom));
-	config_rom[0] = BIB_CRC_LENGTH(4) | BIB_INFO_LENGTH(4) | BIB_CRC(0);
-	config_rom[1] = 0x31333934;
-
-	config_rom[2] =
+	config_rom[0] = cpu_to_be32(
+		BIB_CRC_LENGTH(4) | BIB_INFO_LENGTH(4) | BIB_CRC(0));
+	config_rom[1] = cpu_to_be32(0x31333934);
+	config_rom[2] = cpu_to_be32(
 		BIB_LINK_SPEED(card->link_speed) |
 		BIB_GENERATION(card->config_rom_generation++ % 14 + 2) |
 		BIB_MAX_ROM(2) |
 		BIB_MAX_RECEIVE(card->max_receive) |
-		BIB_BMC | BIB_ISC | BIB_CMC | BIB_IMC;
-	config_rom[3] = card->guid >> 32;
-	config_rom[4] = card->guid;
+		BIB_BMC | BIB_ISC | BIB_CMC | BIB_IMC);
+	config_rom[3] = cpu_to_be32(card->guid >> 32);
+	config_rom[4] = cpu_to_be32(card->guid);
 
 	/* Generate root directory. */
-	i = 5;
-	config_rom[i++] = 0;
-	config_rom[i++] = 0x0c0083c0; /* node capabilities */
-	j = i + descriptor_count;
+	config_rom[6] = cpu_to_be32(0x0c0083c0); /* node capabilities */
+	i = 7;
+	j = 7 + descriptor_count;
 
 	/* Generate root directory entries for descriptors. */
 	list_for_each_entry (desc, &descriptor_list, link) {
 		if (desc->immediate > 0)
-			config_rom[i++] = desc->immediate;
-		config_rom[i] = desc->key | (j - i);
+			config_rom[i++] = cpu_to_be32(desc->immediate);
+		config_rom[i] = cpu_to_be32(desc->key | (j - i));
 		i++;
 		j += desc->length;
 	}
 
 	/* Update root directory length. */
-	config_rom[5] = (i - 5 - 1) << 16;
+	config_rom[5] = cpu_to_be32((i - 5 - 1) << 16);
 
 	/* End of root directory, now copy in descriptors. */
 	list_for_each_entry (desc, &descriptor_list, link) {
-		memcpy(&config_rom[i], desc->data, desc->length * 4);
+		for (k = 0; k < desc->length; k++)
+			config_rom[i + k] = cpu_to_be32(desc->data[k]);
 		i += desc->length;
 	}
 
@@ -129,9 +140,9 @@ static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length)
 	 * the bus info block, which is always the case for this
 	 * implementation. */
 	for (i = 0; i < j; i += length + 1)
-		length = fw_compute_block_crc(config_rom + i);
+		length = __compute_block_crc(config_rom + i);
 
-	*config_rom_length = j;
+	*rom_length = j;
 
 	return config_rom;
 }
@@ -139,7 +150,7 @@ static u32 *generate_config_rom(struct fw_card *card, size_t *config_rom_length)
 static void update_config_roms(void)
 {
 	struct fw_card *card;
-	u32 *config_rom;
+	__be32 *config_rom;
 	size_t length;
 
 	list_for_each_entry (card, &card_list, link) {
@@ -432,7 +443,7 @@ EXPORT_SYMBOL(fw_card_initialize);
 int fw_card_add(struct fw_card *card,
 		u32 max_receive, u32 link_speed, u64 guid)
 {
-	u32 *config_rom;
+	__be32 *config_rom;
 	size_t length;
 	int ret;
 
@@ -462,7 +473,8 @@ EXPORT_SYMBOL(fw_card_add);
  * shutdown still need to be provided by the card driver.
  */
 
-static int dummy_enable(struct fw_card *card, u32 *config_rom, size_t length)
+static int dummy_enable(struct fw_card *card,
+			const __be32 *config_rom, size_t length)
 {
 	BUG();
 	return -1;
@@ -475,7 +487,7 @@ static int dummy_update_phy_reg(struct fw_card *card, int address,
 }
 
 static int dummy_set_config_rom(struct fw_card *card,
-				u32 *config_rom, size_t length)
+				const __be32 *config_rom, size_t length)
 {
 	/*
 	 * We take the card out of card_list before setting the dummy
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 7ff6e7585152..7adca7cb9f55 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -40,7 +40,8 @@ struct fw_card_driver {
 	 * enable the PHY or set the link_on bit and initiate a bus
 	 * reset.
 	 */
-	int (*enable)(struct fw_card *card, u32 *config_rom, size_t length);
+	int (*enable)(struct fw_card *card,
+		      const __be32 *config_rom, size_t length);
 
 	int (*update_phy_reg)(struct fw_card *card, int address,
 			      int clear_bits, int set_bits);
@@ -48,10 +49,10 @@ struct fw_card_driver {
 	/*
 	 * Update the config rom for an enabled card.  This function
 	 * should change the config rom that is presented on the bus
-	 * an initiate a bus reset.
+	 * and initiate a bus reset.
 	 */
 	int (*set_config_rom)(struct fw_card *card,
-			      u32 *config_rom, size_t length);
+			      const __be32 *config_rom, size_t length);
 
 	void (*send_request)(struct fw_card *card, struct fw_packet *packet);
 	void (*send_response)(struct fw_card *card, struct fw_packet *packet);
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 5d524254499e..418415564791 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -205,7 +205,7 @@ struct fw_ohci {
 	dma_addr_t config_rom_bus;
 	__be32 *next_config_rom;
 	dma_addr_t next_config_rom_bus;
-	u32 next_header;
+	__be32 next_header;
 
 	struct ar_context ar_request_ctx;
 	struct ar_context ar_response_ctx;
@@ -1355,8 +1355,9 @@ static void bus_reset_tasklet(unsigned long data)
 		 */
 		reg_write(ohci, OHCI1394_BusOptions,
 			  be32_to_cpu(ohci->config_rom[2]));
-		ohci->config_rom[0] = cpu_to_be32(ohci->next_header);
-		reg_write(ohci, OHCI1394_ConfigROMhdr, ohci->next_header);
+		ohci->config_rom[0] = ohci->next_header;
+		reg_write(ohci, OHCI1394_ConfigROMhdr,
+			  be32_to_cpu(ohci->next_header));
 	}
 
 #ifdef CONFIG_FIREWIRE_OHCI_REMOTE_DMA
@@ -1464,7 +1465,17 @@ static int software_reset(struct fw_ohci *ohci)
 	return -EBUSY;
 }
 
-static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length)
+static void copy_config_rom(__be32 *dest, const __be32 *src, size_t length)
+{
+	size_t size = length * 4;
+
+	memcpy(dest, src, size);
+	if (size < CONFIG_ROM_SIZE)
+		memset(&dest[length], 0, CONFIG_ROM_SIZE - size);
+}
+
+static int ohci_enable(struct fw_card *card,
+		       const __be32 *config_rom, size_t length)
 {
 	struct fw_ohci *ohci = fw_ohci(card);
 	struct pci_dev *dev = to_pci_dev(card->device);
@@ -1565,8 +1576,7 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length)
 		if (ohci->next_config_rom == NULL)
 			return -ENOMEM;
 
-		memset(ohci->next_config_rom, 0, CONFIG_ROM_SIZE);
-		fw_memcpy_to_be32(ohci->next_config_rom, config_rom, length * 4);
+		copy_config_rom(ohci->next_config_rom, config_rom, length);
 	} else {
 		/*
 		 * In the suspend case, config_rom is NULL, which
@@ -1576,7 +1586,7 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length)
 		ohci->next_config_rom_bus = ohci->config_rom_bus;
 	}
 
-	ohci->next_header = be32_to_cpu(ohci->next_config_rom[0]);
+	ohci->next_header = ohci->next_config_rom[0];
 	ohci->next_config_rom[0] = 0;
 	reg_write(ohci, OHCI1394_ConfigROMhdr, 0);
 	reg_write(ohci, OHCI1394_BusOptions,
@@ -1610,7 +1620,7 @@ static int ohci_enable(struct fw_card *card, u32 *config_rom, size_t length)
 }
 
 static int ohci_set_config_rom(struct fw_card *card,
-			       u32 *config_rom, size_t length)
+			       const __be32 *config_rom, size_t length)
 {
 	struct fw_ohci *ohci;
 	unsigned long flags;
@@ -1659,9 +1669,7 @@ static int ohci_set_config_rom(struct fw_card *card,
 		ohci->next_config_rom = next_config_rom;
 		ohci->next_config_rom_bus = next_config_rom_bus;
 
-		memset(ohci->next_config_rom, 0, CONFIG_ROM_SIZE);
-		fw_memcpy_to_be32(ohci->next_config_rom, config_rom,
-				  length * 4);
+		copy_config_rom(ohci->next_config_rom, config_rom, length);
 
 		ohci->next_header = config_rom[0];
 		ohci->next_config_rom[0] = 0;
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 7e1d4dec83e7..53b9217de86c 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -20,20 +20,6 @@
 #define fw_notify(s, args...) printk(KERN_NOTICE KBUILD_MODNAME ": " s, ## args)
 #define fw_error(s, args...) printk(KERN_ERR KBUILD_MODNAME ": " s, ## args)
 
-static inline void fw_memcpy_from_be32(void *_dst, void *_src, size_t size)
-{
-	u32    *dst = _dst;
-	__be32 *src = _src;
-	int i;
-
-	for (i = 0; i < size / 4; i++)
-		dst[i] = be32_to_cpu(src[i]);
-}
-
-static inline void fw_memcpy_to_be32(void *_dst, void *_src, size_t size)
-{
-	fw_memcpy_from_be32(_dst, _src, size);
-}
 #define CSR_REGISTER_BASE		0xfffff0000000ULL
 
 /* register offsets are relative to CSR_REGISTER_BASE */
-- 
cgit v1.2.3


From cb7c96da3651111efbe088fa12f9bed61836ea93 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 8 Oct 2009 00:42:53 +0200
Subject: firewire: core: optimize Topology Map creation

The Topology Map of the local node was created in CPU byte order,
then a temporary big endian copy was created to compute the CRC,
and when a read request to the Topology Map arrived it had to be
converted to big endian byte order again.

We now generate it in big endian byte order in the first place.
This also rids us of 1000 bytes stack usage in tasklet context.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-card.c        | 17 ++---------------
 drivers/firewire/core-topology.c    | 17 ++++++++++-------
 drivers/firewire/core-transaction.c |  9 ++-------
 drivers/firewire/core.h             |  2 +-
 include/linux/firewire.h            |  2 +-
 5 files changed, 16 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index f58130789990..7083bcc1b9c7 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -38,7 +38,7 @@
 
 #include "core.h"
 
-static int __compute_block_crc(__be32 *block)
+int fw_compute_block_crc(__be32 *block)
 {
 	int length;
 	u16 crc;
@@ -50,19 +50,6 @@ static int __compute_block_crc(__be32 *block)
 	return length;
 }
 
-int fw_compute_block_crc(u32 *block)
-{
-	__be32 be32_block[256];
-	int i, length;
-
-	length = (*block >> 16) & 0xff;
-	for (i = 0; i < length; i++)
-		be32_block[i] = cpu_to_be32(block[i + 1]);
-	*block |= crc_itu_t(0, (u8 *) be32_block, length * 4);
-
-	return length;
-}
-
 static DEFINE_MUTEX(card_mutex);
 static LIST_HEAD(card_list);
 
@@ -141,7 +128,7 @@ static size_t generate_config_rom(struct fw_card *card, __be32 *config_rom)
 	 * the bus info block, which is always the case for this
 	 * implementation. */
 	for (i = 0; i < j; i += length + 1)
-		length = __compute_block_crc(config_rom + i);
+		length = fw_compute_block_crc(config_rom + i);
 
 	return j;
 }
diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index fddf2b358936..9a5f38c80b0e 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -28,9 +28,9 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/string.h>
 
 #include <asm/atomic.h>
+#include <asm/byteorder.h>
 #include <asm/system.h>
 
 #include "core.h"
@@ -510,13 +510,16 @@ static void update_tree(struct fw_card *card, struct fw_node *root)
 static void update_topology_map(struct fw_card *card,
 				u32 *self_ids, int self_id_count)
 {
-	int node_count;
+	int node_count = (card->root_node->node_id & 0x3f) + 1;
+	__be32 *map = card->topology_map;
+
+	*map++ = cpu_to_be32((self_id_count + 2) << 16);
+	*map++ = cpu_to_be32(be32_to_cpu(card->topology_map[1]) + 1);
+	*map++ = cpu_to_be32((node_count << 16) | self_id_count);
+
+	while (self_id_count--)
+		*map++ = cpu_to_be32p(self_ids++);
 
-	card->topology_map[1]++;
-	node_count = (card->root_node->node_id & 0x3f) + 1;
-	card->topology_map[2] = (node_count << 16) | self_id_count;
-	card->topology_map[0] = (self_id_count + 2) << 16;
-	memcpy(&card->topology_map[3], self_ids, self_id_count * 4);
 	fw_compute_block_crc(card->topology_map);
 }
 
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index da628c72a462..203e6428bada 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -810,8 +810,7 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request
 		int speed, unsigned long long offset,
 		void *payload, size_t length, void *callback_data)
 {
-	int i, start, end;
-	__be32 *map;
+	int start;
 
 	if (!TCODE_IS_READ_REQUEST(tcode)) {
 		fw_send_response(card, request, RCODE_TYPE_ERROR);
@@ -824,11 +823,7 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request
 	}
 
 	start = (offset - topology_map_region.start) / 4;
-	end = start + length / 4;
-	map = payload;
-
-	for (i = 0; i < length / 4; i++)
-		map[i] = cpu_to_be32(card->topology_map[start + i]);
+	memcpy(payload, &card->topology_map[start], length);
 
 	fw_send_response(card, request, RCODE_COMPLETE);
 }
diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h
index 7adca7cb9f55..ed3b1a765c00 100644
--- a/drivers/firewire/core.h
+++ b/drivers/firewire/core.h
@@ -94,7 +94,7 @@ int fw_card_add(struct fw_card *card,
 		u32 max_receive, u32 link_speed, u64 guid);
 void fw_core_remove_card(struct fw_card *card);
 int fw_core_initiate_bus_reset(struct fw_card *card, int short_reset);
-int fw_compute_block_crc(u32 *block);
+int fw_compute_block_crc(__be32 *block);
 void fw_schedule_bm_work(struct fw_card *card, unsigned long delay);
 
 static inline struct fw_card *fw_card_get(struct fw_card *card)
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 53b9217de86c..211a5d7d87b3 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -117,7 +117,7 @@ struct fw_card {
 
 	bool broadcast_channel_allocated;
 	u32 broadcast_channel;
-	u32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
+	__be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4];
 };
 
 struct fw_attribute_group {
-- 
cgit v1.2.3


From 6fb2915df7f0747d9044da9dbff5b46dc2e20830 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 15 Oct 2009 11:21:42 +0800
Subject: tracing/profile: Add filter support

- Add an ioctl to allocate a filter for a perf event.

- Free the filter when the associated perf event is to be freed.

- Do the filtering in perf_swevent_match().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4AD69546.8050401@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |  11 ++-
 include/linux/perf_counter.h       |   1 +
 include/linux/perf_event.h         |   6 ++
 kernel/perf_event.c                |  80 ++++++++++++++++++++--
 kernel/trace/trace.h               |   3 +-
 kernel/trace/trace_events_filter.c | 133 +++++++++++++++++++++++++++++--------
 6 files changed, 199 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 4ec5e67e18cf..d11770472bc8 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -144,7 +144,7 @@ extern char			*trace_profile_buf_nmi;
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
 
 extern void destroy_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern int filter_match_preds(struct event_filter *filter, void *rec);
 extern int filter_current_check_discard(struct ring_buffer *buffer,
 					struct ftrace_event_call *call,
 					void *rec,
@@ -186,4 +186,13 @@ do {									\
 		__trace_printk(ip, fmt, ##args);			\
 } while (0)
 
+#ifdef CONFIG_EVENT_PROFILE
+struct perf_event;
+extern int ftrace_profile_enable(int event_id);
+extern void ftrace_profile_disable(int event_id);
+extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+				     char *filter_str);
+extern void ftrace_profile_free_filter(struct perf_event *event);
+#endif
+
 #endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7b7fbf433cff..91a2b4309e7a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -225,6 +225,7 @@ struct perf_counter_attr {
 #define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
 #define PERF_COUNTER_IOC_PERIOD		_IOW('$', 4, u64)
 #define PERF_COUNTER_IOC_SET_OUTPUT	_IO ('$', 5)
+#define PERF_COUNTER_IOC_SET_FILTER	_IOW('$', 6, char *)
 
 enum perf_counter_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2e6d95f97419..df9d964c15fc 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -221,6 +221,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_RESET		_IO ('$', 3)
 #define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, u64)
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
+#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
@@ -633,7 +634,12 @@ struct perf_event {
 
 	struct pid_namespace		*ns;
 	u64				id;
+
+#ifdef CONFIG_EVENT_PROFILE
+	struct event_filter		*filter;
 #endif
+
+#endif /* CONFIG_PERF_EVENTS */
 };
 
 /**
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9d0b5c665883..12b5ec39bf97 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -28,6 +28,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
+#include <linux/ftrace_event.h>
 
 #include <asm/irq_regs.h>
 
@@ -1658,6 +1659,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
 	return ERR_PTR(err);
 }
 
+static void perf_event_free_filter(struct perf_event *event);
+
 static void free_event_rcu(struct rcu_head *head)
 {
 	struct perf_event *event;
@@ -1665,6 +1668,7 @@ static void free_event_rcu(struct rcu_head *head)
 	event = container_of(head, struct perf_event, rcu_head);
 	if (event->ns)
 		put_pid_ns(event->ns);
+	perf_event_free_filter(event);
 	kfree(event);
 }
 
@@ -1974,7 +1978,8 @@ unlock:
 	return ret;
 }
 
-int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -2002,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_EVENT_IOC_SET_OUTPUT:
 		return perf_event_set_output(event, arg);
 
+	case PERF_EVENT_IOC_SET_FILTER:
+		return perf_event_set_filter(event, (void __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -3806,9 +3814,14 @@ static int perf_swevent_is_counting(struct perf_event *event)
 	return 1;
 }
 
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data);
+
 static int perf_swevent_match(struct perf_event *event,
 				enum perf_type_id type,
-				u32 event_id, struct pt_regs *regs)
+				u32 event_id,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
 {
 	if (!perf_swevent_is_counting(event))
 		return 0;
@@ -3826,6 +3839,10 @@ static int perf_swevent_match(struct perf_event *event,
 			return 0;
 	}
 
+	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
+	    !perf_tp_event_match(event, data))
+		return 0;
+
 	return 1;
 }
 
@@ -3842,7 +3859,7 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-		if (perf_swevent_match(event, type, event_id, regs))
+		if (perf_swevent_match(event, type, event_id, data, regs))
 			perf_swevent_add(event, nr, nmi, data, regs);
 	}
 	rcu_read_unlock();
@@ -4086,6 +4103,7 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
+
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 			  int entry_size)
 {
@@ -4109,8 +4127,15 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
-extern int ftrace_profile_enable(int);
-extern void ftrace_profile_disable(int);
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	void *record = data->raw->data;
+
+	if (likely(!event->filter) || filter_match_preds(event->filter, record))
+		return 1;
+	return 0;
+}
 
 static void tp_perf_event_destroy(struct perf_event *event)
 {
@@ -4135,12 +4160,53 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 
 	return &perf_ops_generic;
 }
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+	char *filter_str;
+	int ret;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	filter_str = strndup_user(arg, PAGE_SIZE);
+	if (IS_ERR(filter_str))
+		return PTR_ERR(filter_str);
+
+	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+	kfree(filter_str);
+	return ret;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+	ftrace_profile_free_filter(event);
+}
+
 #else
+
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	return 1;
+}
+
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
 	return NULL;
 }
-#endif
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+#endif /* CONFIG_EVENT_PROFILE */
 
 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
@@ -4394,7 +4460,7 @@ err_size:
 	goto out;
 }
 
-int perf_event_set_output(struct perf_event *event, int output_fd)
+static int perf_event_set_output(struct perf_event *event, int output_fd)
 {
 	struct perf_event *output_event = NULL;
 	struct file *output_file = NULL;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ffe53ddbe67a..4959ada9e0bb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -743,7 +743,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+	if (unlikely(call->filter_active) &&
+	    !filter_match_preds(call->filter, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
 	}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 273845fce393..e27bb6acc2dd 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/mutex.h>
+#include <linux/perf_event.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -363,9 +364,8 @@ static void filter_build_regex(struct filter_pred *pred)
 }
 
 /* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+int filter_match_preds(struct event_filter *filter, void *rec)
 {
-	struct event_filter *filter = call->filter;
 	int match, top = 0, val1 = 0, val2 = 0;
 	int stack[MAX_FILTER_PRED];
 	struct filter_pred *pred;
@@ -538,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
 		filter->preds[i]->fn = filter_pred_none;
 }
 
-void destroy_preds(struct ftrace_event_call *call)
+static void __free_preds(struct event_filter *filter)
 {
-	struct event_filter *filter = call->filter;
 	int i;
 
 	if (!filter)
@@ -553,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
 	kfree(filter->preds);
 	kfree(filter->filter_string);
 	kfree(filter);
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+	__free_preds(call->filter);
 	call->filter = NULL;
+	call->filter_active = 0;
 }
 
-static int init_preds(struct ftrace_event_call *call)
+static struct event_filter *__alloc_preds(void)
 {
 	struct event_filter *filter;
 	struct filter_pred *pred;
 	int i;
 
-	if (call->filter)
-		return 0;
-
-	filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-	if (!call->filter)
-		return -ENOMEM;
+	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (!filter)
+		return ERR_PTR(-ENOMEM);
 
 	filter->n_preds = 0;
 
@@ -583,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
 		filter->preds[i] = pred;
 	}
 
-	return 0;
+	return filter;
 
 oom:
-	destroy_preds(call);
+	__free_preds(filter);
+	return ERR_PTR(-ENOMEM);
+}
+
+static int init_preds(struct ftrace_event_call *call)
+{
+	if (call->filter)
+		return 0;
+
+	call->filter_active = 0;
+	call->filter = __alloc_preds();
+	if (IS_ERR(call->filter))
+		return PTR_ERR(call->filter);
 
-	return -ENOMEM;
+	return 0;
 }
 
 static int init_subsystem_preds(struct event_subsystem *system)
@@ -629,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 
 static int filter_add_pred_fn(struct filter_parse_state *ps,
 			      struct ftrace_event_call *call,
+			      struct event_filter *filter,
 			      struct filter_pred *pred,
 			      filter_pred_fn_t fn)
 {
-	struct event_filter *filter = call->filter;
 	int idx, err;
 
 	if (filter->n_preds == MAX_FILTER_PRED) {
@@ -647,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 		return err;
 
 	filter->n_preds++;
-	call->filter_active = 1;
 
 	return 0;
 }
@@ -726,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
 
 static int filter_add_pred(struct filter_parse_state *ps,
 			   struct ftrace_event_call *call,
+			   struct event_filter *filter,
 			   struct filter_pred *pred,
 			   bool dry_run)
 {
@@ -795,7 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 
 add_pred_fn:
 	if (!dry_run)
-		return filter_add_pred_fn(ps, call, pred, fn);
+		return filter_add_pred_fn(ps, call, filter, pred, fn);
 	return 0;
 }
 
@@ -1154,6 +1168,7 @@ static int check_preds(struct filter_parse_state *ps)
 }
 
 static int replace_preds(struct ftrace_event_call *call,
+			 struct event_filter *filter,
 			 struct filter_parse_state *ps,
 			 char *filter_string,
 			 bool dry_run)
@@ -1200,7 +1215,7 @@ static int replace_preds(struct ftrace_event_call *call,
 add_pred:
 		if (!pred)
 			return -ENOMEM;
-		err = filter_add_pred(ps, call, pred, dry_run);
+		err = filter_add_pred(ps, call, filter, pred, dry_run);
 		filter_free_pred(pred);
 		if (err)
 			return err;
@@ -1216,6 +1231,7 @@ static int replace_system_preds(struct event_subsystem *system,
 				char *filter_string)
 {
 	struct ftrace_event_call *call;
+	struct event_filter *filter;
 	int err;
 	bool fail = true;
 
@@ -1228,17 +1244,19 @@ static int replace_system_preds(struct event_subsystem *system,
 			continue;
 
 		/* try to see if the filter can be applied */
-		err = replace_preds(call, ps, filter_string, true);
+		err = replace_preds(call, filter, ps, filter_string, true);
 		if (err)
 			continue;
 
 		/* really apply the filter */
 		filter_disable_preds(call);
-		err = replace_preds(call, ps, filter_string, false);
+		err = replace_preds(call, filter, ps, filter_string, false);
 		if (err)
 			filter_disable_preds(call);
-		else
-			replace_filter_string(call->filter, filter_string);
+		else {
+			call->filter_active = 1;
+			replace_filter_string(filter, filter_string);
+		}
 		fail = false;
 	}
 
@@ -1252,7 +1270,6 @@ static int replace_system_preds(struct event_subsystem *system,
 int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 {
 	int err;
-
 	struct filter_parse_state *ps;
 
 	mutex_lock(&event_mutex);
@@ -1283,10 +1300,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 		goto out;
 	}
 
-	err = replace_preds(call, ps, filter_string, false);
+	err = replace_preds(call, call->filter, ps, filter_string, false);
 	if (err)
 		append_filter_err(ps, call->filter);
-
+	else
+		call->filter_active = 1;
 out:
 	filter_opstack_clear(ps);
 	postfix_clear(ps);
@@ -1301,7 +1319,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 				 char *filter_string)
 {
 	int err;
-
 	struct filter_parse_state *ps;
 
 	mutex_lock(&event_mutex);
@@ -1345,3 +1362,67 @@ out_unlock:
 	return err;
 }
 
+#ifdef CONFIG_EVENT_PROFILE
+
+void ftrace_profile_free_filter(struct perf_event *event)
+{
+	struct event_filter *filter = event->filter;
+
+	event->filter = NULL;
+	__free_preds(filter);
+}
+
+int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+			      char *filter_str)
+{
+	int err;
+	struct event_filter *filter;
+	struct filter_parse_state *ps;
+	struct ftrace_event_call *call = NULL;
+
+	mutex_lock(&event_mutex);
+
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (call->id == event_id)
+			break;
+	}
+	if (!call)
+		return -EINVAL;
+
+	if (event->filter)
+		return -EEXIST;
+
+	filter = __alloc_preds();
+	if (IS_ERR(filter))
+		return PTR_ERR(filter);
+
+	err = -ENOMEM;
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		goto free_preds;
+
+	parse_init(ps, filter_ops, filter_str);
+	err = filter_parse(ps);
+	if (err)
+		goto free_ps;
+
+	err = replace_preds(call, filter, ps, filter_str, false);
+	if (!err)
+		event->filter = filter;
+
+free_ps:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+free_preds:
+	if (err)
+		__free_preds(filter);
+
+	mutex_unlock(&event_mutex);
+
+	return err;
+}
+
+#endif /* CONFIG_EVENT_PROFILE */
+
-- 
cgit v1.2.3


From 731581e6a653f6a68a4d7ba9df6b886a85c7d080 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:57:46 -0600
Subject: of: merge phandle, ihandle and struct property

Merge of common code duplicated between Sparc, PowerPC and Microblaze

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h | 10 ----------
 arch/powerpc/include/asm/prom.h    | 12 ------------
 arch/sparc/include/asm/prom.h      | 12 ------------
 include/linux/of.h                 | 12 ++++++++++++
 4 files changed, 12 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 66368896f922..11cb48419c7d 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -73,16 +73,6 @@ struct boot_param_header {
 	u32	dt_struct_size; /* size of the DT structure block */
 };
 
-typedef u32 phandle;
-typedef u32 ihandle;
-
-struct property {
-	char	*name;
-	int	length;
-	void	*value;
-	struct property *next;
-};
-
 struct device_node {
 	const char *name;
 	const char *type;
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index b0a84ea5f8ed..c236326177a8 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -71,18 +71,6 @@ struct boot_param_header
 	u32	dt_struct_size;		/* size of the DT structure block */
 };
 
-
-
-typedef u32 phandle;
-typedef u32 ihandle;
-
-struct property {
-	char	*name;
-	int	length;
-	void	*value;
-	struct property *next;
-};
-
 struct device_node {
 	const char *name;
 	const char *type;
diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index 0733170e02ca..b34f988a2aad 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -29,18 +29,6 @@
 #define of_prop_cmp(s1, s2)		strcasecmp((s1), (s2))
 #define of_node_cmp(s1, s2)		strcmp((s1), (s2))
 
-typedef u32 phandle;
-typedef u32 ihandle;
-
-struct property {
-	char	*name;
-	int	length;
-	void	*value;
-	struct property *next;
-	unsigned long _flags;
-	unsigned int unique_id;
-};
-
 struct of_irq_controller;
 struct device_node {
 	const char	*name;
diff --git a/include/linux/of.h b/include/linux/of.h
index 7be2d1043c16..4668b298479a 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -19,6 +19,18 @@
 #include <linux/bitops.h>
 #include <linux/mod_devicetable.h>
 
+typedef u32 phandle;
+typedef u32 ihandle;
+
+struct property {
+	char	*name;
+	int	length;
+	void	*value;
+	struct property *next;
+	unsigned long _flags;
+	unsigned int unique_id;
+};
+
 #include <asm/prom.h>
 
 /* flag descriptions */
-- 
cgit v1.2.3


From 6f1924928377bd035a9f64466f91a487c69271d2 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:57:49 -0600
Subject: of: merge struct device_node

Merge of common code duplicated between Sparc, PowerPC and Microblaze

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h | 20 --------------------
 arch/powerpc/include/asm/prom.h    | 20 --------------------
 arch/sparc/include/asm/prom.h      | 24 ------------------------
 include/linux/of.h                 | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 32 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 11cb48419c7d..64e8b3a8c3cf 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -73,26 +73,6 @@ struct boot_param_header {
 	u32	dt_struct_size; /* size of the DT structure block */
 };
 
-struct device_node {
-	const char *name;
-	const char *type;
-	phandle	node;
-	phandle linux_phandle;
-	char	*full_name;
-
-	struct	property *properties;
-	struct	property *deadprops; /* removed properties */
-	struct	device_node *parent;
-	struct	device_node *child;
-	struct	device_node *sibling;
-	struct	device_node *next; /* next device of same type */
-	struct	device_node *allnext; /* next in list of all nodes */
-	struct	proc_dir_entry *pde; /* this node's proc directory */
-	struct	kref kref;
-	unsigned long _flags;
-	void	*data;
-};
-
 extern struct device_node *of_chosen;
 
 static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index c236326177a8..c918db535f08 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -71,26 +71,6 @@ struct boot_param_header
 	u32	dt_struct_size;		/* size of the DT structure block */
 };
 
-struct device_node {
-	const char *name;
-	const char *type;
-	phandle	node;
-	phandle linux_phandle;
-	char	*full_name;
-
-	struct	property *properties;
-	struct  property *deadprops; /* removed properties */
-	struct	device_node *parent;
-	struct	device_node *child;
-	struct	device_node *sibling;
-	struct	device_node *next;	/* next device of same type */
-	struct	device_node *allnext;	/* next in list of all nodes */
-	struct  proc_dir_entry *pde;	/* this node's proc directory */
-	struct  kref kref;
-	unsigned long _flags;
-	void	*data;
-};
-
 extern struct device_node *of_chosen;
 
 static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index b34f988a2aad..e5f4a1d8fc46 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -29,30 +29,6 @@
 #define of_prop_cmp(s1, s2)		strcasecmp((s1), (s2))
 #define of_node_cmp(s1, s2)		strcmp((s1), (s2))
 
-struct of_irq_controller;
-struct device_node {
-	const char	*name;
-	const char	*type;
-	phandle	node;
-	char	*path_component_name;
-	char	*full_name;
-
-	struct	property *properties;
-	struct  property *deadprops; /* removed properties */
-	struct	device_node *parent;
-	struct	device_node *child;
-	struct	device_node *sibling;
-	struct	device_node *next;	/* next device of same type */
-	struct	device_node *allnext;	/* next in list of all nodes */
-	struct  proc_dir_entry *pde;	/* this node's proc directory */
-	struct  kref kref;
-	unsigned long _flags;
-	void	*data;
-	unsigned int unique_id;
-
-	struct of_irq_controller *irq_trans;
-};
-
 struct of_irq_controller {
 	unsigned int	(*irq_build)(struct device_node *, unsigned int, void *);
 	void		*data;
diff --git a/include/linux/of.h b/include/linux/of.h
index 4668b298479a..65a158dc7257 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -17,6 +17,7 @@
  */
 #include <linux/types.h>
 #include <linux/bitops.h>
+#include <linux/kref.h>
 #include <linux/mod_devicetable.h>
 
 typedef u32 phandle;
@@ -31,6 +32,37 @@ struct property {
 	unsigned int unique_id;
 };
 
+#if defined(CONFIG_SPARC)
+struct of_irq_controller;
+#endif
+
+struct device_node {
+	const char *name;
+	const char *type;
+	phandle	node;
+#if !defined(CONFIG_SPARC)
+	phandle linux_phandle;
+#endif
+	char	*full_name;
+
+	struct	property *properties;
+	struct	property *deadprops;	/* removed properties */
+	struct	device_node *parent;
+	struct	device_node *child;
+	struct	device_node *sibling;
+	struct	device_node *next;	/* next device of same type */
+	struct	device_node *allnext;	/* next in list of all nodes */
+	struct	proc_dir_entry *pde;	/* this node's proc directory */
+	struct	kref kref;
+	unsigned long _flags;
+	void	*data;
+#if defined(CONFIG_SPARC)
+	char	*path_component_name;
+	unsigned int unique_id;
+	struct of_irq_controller *irq_trans;
+#endif
+};
+
 #include <asm/prom.h>
 
 /* flag descriptions */
-- 
cgit v1.2.3


From 61e955db539e748cff2b8ea3bf7705259ebe9fb6 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:57:51 -0600
Subject: of: Move OF_IS_DYNAMIC and OF_MARK_DYNAMIC macros to of.h

Merge of common code duplicated between Sparc, PowerPC and Microblaze

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/sparc/include/asm/prom.h | 3 ---
 include/linux/of.h            | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index e5f4a1d8fc46..ddbd870b5720 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -34,9 +34,6 @@ struct of_irq_controller {
 	void		*data;
 };
 
-#define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
-#define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
-
 extern struct device_node *of_find_node_by_cpuid(int cpuid);
 extern int of_set_property(struct device_node *node, const char *name, void *val, int len);
 extern struct mutex of_set_property_mutex;
diff --git a/include/linux/of.h b/include/linux/of.h
index 65a158dc7257..a66c1eb31693 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -69,6 +69,9 @@ struct device_node {
 #define OF_DYNAMIC	1 /* node and properties were allocated via kmalloc */
 #define OF_DETACHED	2 /* node has been detached from the device tree */
 
+#define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
+#define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
+
 #define OF_BAD_ADDR	((u64)-1)
 
 extern struct device_node *of_find_node_by_name(struct device_node *from,
-- 
cgit v1.2.3


From d8678b58708d7e6bf947ebd03eaf44baf2adfad8 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:57:53 -0600
Subject: of: add common header for flattened device tree representation

Add a common header file for working with the flattened device tree
data structure and merge the shared data tags used by Microblaze and
PowerPC

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h | 12 +-----------
 arch/microblaze/kernel/head.S      |  2 +-
 arch/powerpc/include/asm/prom.h    | 12 +-----------
 include/linux/of_fdt.h             | 26 ++++++++++++++++++++++++++
 4 files changed, 29 insertions(+), 23 deletions(-)
 create mode 100644 include/linux/of_fdt.h

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 64e8b3a8c3cf..5f461f08db11 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -17,20 +17,10 @@
 #ifndef _ASM_MICROBLAZE_PROM_H
 #define _ASM_MICROBLAZE_PROM_H
 #ifdef __KERNEL__
-
-/* Definitions used by the flattened device tree */
-#define OF_DT_HEADER		0xd00dfeed /* marker */
-#define OF_DT_BEGIN_NODE	0x1 /* Start of node, full name */
-#define OF_DT_END_NODE		0x2 /* End node */
-#define OF_DT_PROP		0x3 /* Property: name off, size, content */
-#define OF_DT_NOP		0x4 /* nop */
-#define OF_DT_END		0x9
-
-#define OF_DT_VERSION		0x10
-
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
+#include <linux/of_fdt.h>
 #include <linux/proc_fs.h>
 #include <linux/platform_device.h>
 #include <asm/irq.h>
diff --git a/arch/microblaze/kernel/head.S b/arch/microblaze/kernel/head.S
index 697ce3007f30..30916193fcc7 100644
--- a/arch/microblaze/kernel/head.S
+++ b/arch/microblaze/kernel/head.S
@@ -31,7 +31,7 @@
 #include <linux/linkage.h>
 #include <asm/thread_info.h>
 #include <asm/page.h>
-#include <asm/prom.h>		/* for OF_DT_HEADER */
+#include <linux/of_fdt.h>		/* for OF_DT_HEADER */
 
 #ifdef CONFIG_MMU
 #include <asm/setup.h> /* COMMAND_LINE_SIZE */
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index c918db535f08..7181f8ac40f9 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -17,6 +17,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <linux/types.h>
+#include <linux/of_fdt.h>
 #include <linux/proc_fs.h>
 #include <linux/platform_device.h>
 #include <asm/irq.h>
@@ -29,17 +30,6 @@
 #define of_prop_cmp(s1, s2)		strcmp((s1), (s2))
 #define of_node_cmp(s1, s2)		strcasecmp((s1), (s2))
 
-/* Definitions used by the flattened device tree */
-#define OF_DT_HEADER		0xd00dfeed	/* marker */
-#define OF_DT_BEGIN_NODE	0x1		/* Start of node, full name */
-#define OF_DT_END_NODE		0x2		/* End node */
-#define OF_DT_PROP		0x3		/* Property: name off, size,
-						 * content */
-#define OF_DT_NOP		0x4		/* nop */
-#define OF_DT_END		0x9
-
-#define OF_DT_VERSION		0x10
-
 /*
  * This is what gets passed to the kernel by prom_init or kexec
  *
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
new file mode 100644
index 000000000000..8b5ecc1cb6aa
--- /dev/null
+++ b/include/linux/of_fdt.h
@@ -0,0 +1,26 @@
+/*
+ * Definitions for working with the Flattened Device Tree data format
+ *
+ * Copyright 2009 Benjamin Herrenschmidt, IBM Corp
+ * benh@kernel.crashing.org
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_OF_FDT_H
+#define _LINUX_OF_FDT_H
+
+/* Definitions used by the flattened device tree */
+#define OF_DT_HEADER		0xd00dfeed	/* marker */
+#define OF_DT_BEGIN_NODE	0x1		/* Start of node, full name */
+#define OF_DT_END_NODE		0x2		/* End node */
+#define OF_DT_PROP		0x3		/* Property: name off, size,
+						 * content */
+#define OF_DT_NOP		0x4		/* nop */
+#define OF_DT_END		0x9
+
+#define OF_DT_VERSION		0x10
+
+#endif /* _LINUX_OF_FDT_H */
-- 
cgit v1.2.3


From d45d94f672e3c79b0db1e6d76e1638ee521d56c0 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:57:55 -0600
Subject: of: merge struct boot_param_header from Microblaze and PowerPC

Merge common code for working with Flattened Device Tree data structure

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h | 30 ------------------------------
 arch/powerpc/include/asm/prom.h    | 31 -------------------------------
 include/linux/of_fdt.h             | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 5f461f08db11..dfc4afcdbd2b 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -33,36 +33,6 @@
 #define of_prop_cmp(s1, s2)		strcmp((s1), (s2))
 #define of_node_cmp(s1, s2)		strcasecmp((s1), (s2))
 
-/*
- * This is what gets passed to the kernel by prom_init or kexec
- *
- * The dt struct contains the device tree structure, full pathes and
- * property contents. The dt strings contain a separate block with just
- * the strings for the property names, and is fully page aligned and
- * self contained in a page, so that it can be kept around by the kernel,
- * each property name appears only once in this page (cheap compression)
- *
- * the mem_rsvmap contains a map of reserved ranges of physical memory,
- * passing it here instead of in the device-tree itself greatly simplifies
- * the job of everybody. It's just a list of u64 pairs (base/size) that
- * ends when size is 0
- */
-struct boot_param_header {
-	u32	magic; /* magic word OF_DT_HEADER */
-	u32	totalsize; /* total size of DT block */
-	u32	off_dt_struct; /* offset to structure */
-	u32	off_dt_strings; /* offset to strings */
-	u32	off_mem_rsvmap; /* offset to memory reserve map */
-	u32	version; /* format version */
-	u32	last_comp_version; /* last compatible version */
-	/* version 2 fields below */
-	u32	boot_cpuid_phys; /* Physical CPU id we're booting on */
-	/* version 3 fields below */
-	u32	dt_strings_size; /* size of the DT strings block */
-	/* version 17 fields below */
-	u32	dt_struct_size; /* size of the DT structure block */
-};
-
 extern struct device_node *of_chosen;
 
 static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 7181f8ac40f9..ef20e6c23235 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -30,37 +30,6 @@
 #define of_prop_cmp(s1, s2)		strcmp((s1), (s2))
 #define of_node_cmp(s1, s2)		strcasecmp((s1), (s2))
 
-/*
- * This is what gets passed to the kernel by prom_init or kexec
- *
- * The dt struct contains the device tree structure, full pathes and
- * property contents. The dt strings contain a separate block with just
- * the strings for the property names, and is fully page aligned and
- * self contained in a page, so that it can be kept around by the kernel,
- * each property name appears only once in this page (cheap compression)
- *
- * the mem_rsvmap contains a map of reserved ranges of physical memory,
- * passing it here instead of in the device-tree itself greatly simplifies
- * the job of everybody. It's just a list of u64 pairs (base/size) that
- * ends when size is 0
- */
-struct boot_param_header
-{
-	u32	magic;			/* magic word OF_DT_HEADER */
-	u32	totalsize;		/* total size of DT block */
-	u32	off_dt_struct;		/* offset to structure */
-	u32	off_dt_strings;		/* offset to strings */
-	u32	off_mem_rsvmap;		/* offset to memory reserve map */
-	u32	version;		/* format version */
-	u32	last_comp_version;	/* last compatible version */
-	/* version 2 fields below */
-	u32	boot_cpuid_phys;	/* Physical CPU id we're booting on */
-	/* version 3 fields below */
-	u32	dt_strings_size;	/* size of the DT strings block */
-	/* version 17 fields below */
-	u32	dt_struct_size;		/* size of the DT structure block */
-};
-
 extern struct device_node *of_chosen;
 
 static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 8b5ecc1cb6aa..b37ad3a973b9 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -23,4 +23,36 @@
 
 #define OF_DT_VERSION		0x10
 
+#ifndef __ASSEMBLY__
+/*
+ * This is what gets passed to the kernel by prom_init or kexec
+ *
+ * The dt struct contains the device tree structure, full pathes and
+ * property contents. The dt strings contain a separate block with just
+ * the strings for the property names, and is fully page aligned and
+ * self contained in a page, so that it can be kept around by the kernel,
+ * each property name appears only once in this page (cheap compression)
+ *
+ * the mem_rsvmap contains a map of reserved ranges of physical memory,
+ * passing it here instead of in the device-tree itself greatly simplifies
+ * the job of everybody. It's just a list of u64 pairs (base/size) that
+ * ends when size is 0
+ */
+struct boot_param_header {
+	u32	magic;			/* magic word OF_DT_HEADER */
+	u32	totalsize;		/* total size of DT block */
+	u32	off_dt_struct;		/* offset to structure */
+	u32	off_dt_strings;		/* offset to strings */
+	u32	off_mem_rsvmap;		/* offset to memory reserve map */
+	u32	version;		/* format version */
+	u32	last_comp_version;	/* last compatible version */
+	/* version 2 fields below */
+	u32	boot_cpuid_phys;	/* Physical CPU id we're booting on */
+	/* version 3 fields below */
+	u32	dt_strings_size;	/* size of the DT strings block */
+	/* version 17 fields below */
+	u32	dt_struct_size;		/* size of the DT structure block */
+};
+
+#endif /* __ASSEMBLY__ */
 #endif /* _LINUX_OF_FDT_H */
-- 
cgit v1.2.3


From 50436312f47f1fd2bf82c983638fe27ca7e03238 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:57:58 -0600
Subject: of: merge of_node_*_flag() and set_node_proc_entry()

Merge common code between PowerPC and Microblaze

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h | 16 ----------------
 arch/powerpc/include/asm/prom.h    | 17 -----------------
 include/linux/of.h                 | 16 ++++++++++++++++
 3 files changed, 16 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index dfc4afcdbd2b..180d84481306 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -35,24 +35,8 @@
 
 extern struct device_node *of_chosen;
 
-static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
-{
-	return test_bit(flag, &n->_flags);
-}
-
-static inline void of_node_set_flag(struct device_node *n, unsigned long flag)
-{
-	set_bit(flag, &n->_flags);
-}
-
 #define HAVE_ARCH_DEVTREE_FIXUPS
 
-static inline void set_node_proc_entry(struct device_node *dn,
-					struct proc_dir_entry *de)
-{
-	dn->pde = de;
-}
-
 extern struct device_node *allnodes;	/* temporary while merging */
 extern rwlock_t devtree_lock;	/* temporary while merging */
 
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index ef20e6c23235..2cfd43288a3e 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -32,25 +32,8 @@
 
 extern struct device_node *of_chosen;
 
-static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
-{
-	return test_bit(flag, &n->_flags);
-}
-
-static inline void of_node_set_flag(struct device_node *n, unsigned long flag)
-{
-	set_bit(flag, &n->_flags);
-}
-
-
 #define HAVE_ARCH_DEVTREE_FIXUPS
 
-static inline void set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de)
-{
-	dn->pde = de;
-}
-
-
 extern struct device_node *of_find_all_nodes(struct device_node *prev);
 extern struct device_node *of_node_get(struct device_node *node);
 extern void of_node_put(struct device_node *node);
diff --git a/include/linux/of.h b/include/linux/of.h
index a66c1eb31693..d5f666290f6b 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -63,6 +63,22 @@ struct device_node {
 #endif
 };
 
+static inline int of_node_check_flag(struct device_node *n, unsigned long flag)
+{
+	return test_bit(flag, &n->_flags);
+}
+
+static inline void of_node_set_flag(struct device_node *n, unsigned long flag)
+{
+	set_bit(flag, &n->_flags);
+}
+
+static inline void
+set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de)
+{
+	dn->pde = de;
+}
+
 #include <asm/prom.h>
 
 /* flag descriptions */
-- 
cgit v1.2.3


From b6caf2ad7ce30648b89c1cf40d8f7cf6f4b58033 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:58:00 -0600
Subject: of: merge of_read_number() an of_read_ulong()

Merge common code between Microblaze and PowerPC

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h | 12 ------------
 arch/powerpc/include/asm/prom.h    | 20 --------------------
 include/linux/of.h                 | 23 +++++++++++++++++++++++
 3 files changed, 23 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index 180d84481306..d4f57ffdae3f 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -82,18 +82,6 @@ extern int release_OF_resource(struct device_node *node, int index);
  * OF address retreival & translation
  */
 
-/* Helper to read a big number; size is in cells (not bytes) */
-static inline u64 of_read_number(const u32 *cell, int size)
-{
-	u64 r = 0;
-	while (size--)
-		r = (r << 32) | *(cell++);
-	return r;
-}
-
-/* Like of_read_number, but we want an unsigned long result */
-#define of_read_ulong(cell, size)	of_read_number(cell, size)
-
 /* Translate an OF address block into a CPU physical address
  */
 extern u64 of_translate_address(struct device_node *np, const u32 *addr);
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 2cfd43288a3e..d8c0525c3139 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -89,26 +89,6 @@ extern int release_OF_resource(struct device_node* node, int index);
  * OF address retreival & translation
  */
 
-
-/* Helper to read a big number; size is in cells (not bytes) */
-static inline u64 of_read_number(const u32 *cell, int size)
-{
-	u64 r = 0;
-	while (size--)
-		r = (r << 32) | *(cell++);
-	return r;
-}
-
-/* Like of_read_number, but we want an unsigned long result */
-#ifdef CONFIG_PPC32
-static inline unsigned long of_read_ulong(const u32 *cell, int size)
-{
-	return cell[size-1];
-}
-#else
-#define of_read_ulong(cell, size)	of_read_number(cell, size)
-#endif
-
 /* Translate an OF address block into a CPU physical address
  */
 extern u64 of_translate_address(struct device_node *np, const u32 *addr);
diff --git a/include/linux/of.h b/include/linux/of.h
index d5f666290f6b..18e4379b8b7f 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -79,6 +79,29 @@ set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de)
 	dn->pde = de;
 }
 
+/*
+ * OF address retreival & translation
+ */
+
+/* Helper to read a big number; size is in cells (not bytes) */
+static inline u64 of_read_number(const u32 *cell, int size)
+{
+	u64 r = 0;
+	while (size--)
+		r = (r << 32) | *(cell++);
+	return r;
+}
+
+/* Like of_read_number, but we want an unsigned long result */
+#ifdef CONFIG_PPC32
+static inline unsigned long of_read_ulong(const u32 *cell, int size)
+{
+	return cell[size-1];
+}
+#else
+#define of_read_ulong(cell, size)	of_read_number(cell, size)
+#endif
+
 #include <asm/prom.h>
 
 /* flag descriptions */
-- 
cgit v1.2.3


From 526b5b3ed97bac22ed0c9feed97adcdc3a25244c Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:58:02 -0600
Subject: of: merge of_node_get(), of_node_put() and of_find_all_nodes()

Merge common code between Sparc, PowerPC and Microblaze.

Sparc differs in the implementation at this point, so this patch uses
a #ifdef to handle sparc differently for now.  The merging of
implementations will occur in a later patch

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h |  4 ----
 arch/powerpc/include/asm/prom.h    |  4 ----
 arch/sparc/include/asm/prom.h      |  9 ---------
 include/linux/of.h                 | 16 ++++++++++++++++
 4 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index d4f57ffdae3f..c92b4a9e4397 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -40,10 +40,6 @@ extern struct device_node *of_chosen;
 extern struct device_node *allnodes;	/* temporary while merging */
 extern rwlock_t devtree_lock;	/* temporary while merging */
 
-extern struct device_node *of_find_all_nodes(struct device_node *prev);
-extern struct device_node *of_node_get(struct device_node *node);
-extern void of_node_put(struct device_node *node);
-
 /* For scanning the flat device-tree at boot time */
 extern int __init of_scan_flat_dt(int (*it)(unsigned long node,
 					const char *uname, int depth,
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index d8c0525c3139..622769cd1d62 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -34,10 +34,6 @@ extern struct device_node *of_chosen;
 
 #define HAVE_ARCH_DEVTREE_FIXUPS
 
-extern struct device_node *of_find_all_nodes(struct device_node *prev);
-extern struct device_node *of_node_get(struct device_node *node);
-extern void of_node_put(struct device_node *node);
-
 /* For scanning the flat device-tree at boot time */
 extern int __init of_scan_flat_dt(int (*it)(unsigned long node,
 					    const char *uname, int depth,
diff --git a/arch/sparc/include/asm/prom.h b/arch/sparc/include/asm/prom.h
index ddbd870b5720..f845828ca4c6 100644
--- a/arch/sparc/include/asm/prom.h
+++ b/arch/sparc/include/asm/prom.h
@@ -51,15 +51,6 @@ extern void prom_build_devicetree(void);
 extern void of_populate_present_mask(void);
 extern void of_fill_in_cpu_data(void);
 
-/* Dummy ref counting routines - to be implemented later */
-static inline struct device_node *of_node_get(struct device_node *node)
-{
-	return node;
-}
-static inline void of_node_put(struct device_node *node)
-{
-}
-
 /* These routines are here to provide compatibility with how powerpc
  * handles IRQ mapping for OF device nodes.  We precompute and permanently
  * register them in the of_device objects, whereas powerpc computes them
diff --git a/include/linux/of.h b/include/linux/of.h
index 18e4379b8b7f..4636bba93afa 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -79,6 +79,22 @@ set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de)
 	dn->pde = de;
 }
 
+#if defined(CONFIG_SPARC)
+/* Dummy ref counting routines - to be implemented later */
+static inline struct device_node *of_node_get(struct device_node *node)
+{
+	return node;
+}
+static inline void of_node_put(struct device_node *node)
+{
+}
+
+#else
+extern struct device_node *of_find_all_nodes(struct device_node *prev);
+extern struct device_node *of_node_get(struct device_node *node);
+extern void of_node_put(struct device_node *node);
+#endif
+
 /*
  * OF address retreival & translation
  */
-- 
cgit v1.2.3


From 8482f56803b9498af84bc09e7bc769a5924f6443 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:58:04 -0600
Subject: of: merge of_*_flat_dt*() functions

Merge common flattened device tree code between Microblaze and PowerPC

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h | 11 -----------
 arch/powerpc/include/asm/prom.h    | 10 ----------
 include/linux/of_fdt.h             | 14 ++++++++++++++
 3 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index c92b4a9e4397..be4db2a22245 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -40,17 +40,6 @@ extern struct device_node *of_chosen;
 extern struct device_node *allnodes;	/* temporary while merging */
 extern rwlock_t devtree_lock;	/* temporary while merging */
 
-/* For scanning the flat device-tree at boot time */
-extern int __init of_scan_flat_dt(int (*it)(unsigned long node,
-					const char *uname, int depth,
-					void *data),
-				void *data);
-extern void *__init of_get_flat_dt_prop(unsigned long node, const char *name,
-					unsigned long *size);
-extern int __init
-		of_flat_dt_is_compatible(unsigned long node, const char *name);
-extern unsigned long __init of_get_flat_dt_root(void);
-
 /* For updating the device tree at runtime */
 extern void of_attach_node(struct device_node *);
 extern void of_detach_node(struct device_node *);
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 622769cd1d62..c8b59330b04a 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -34,16 +34,6 @@ extern struct device_node *of_chosen;
 
 #define HAVE_ARCH_DEVTREE_FIXUPS
 
-/* For scanning the flat device-tree at boot time */
-extern int __init of_scan_flat_dt(int (*it)(unsigned long node,
-					    const char *uname, int depth,
-					    void *data),
-				  void *data);
-extern void* __init of_get_flat_dt_prop(unsigned long node, const char *name,
-					unsigned long *size);
-extern int __init of_flat_dt_is_compatible(unsigned long node, const char *name);
-extern unsigned long __init of_get_flat_dt_root(void);
-
 /* For updating the device tree at runtime */
 extern void of_attach_node(struct device_node *);
 extern void of_detach_node(struct device_node *);
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index b37ad3a973b9..b363eadea819 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -12,6 +12,9 @@
 #ifndef _LINUX_OF_FDT_H
 #define _LINUX_OF_FDT_H
 
+#include <linux/types.h>
+#include <linux/init.h>
+
 /* Definitions used by the flattened device tree */
 #define OF_DT_HEADER		0xd00dfeed	/* marker */
 #define OF_DT_BEGIN_NODE	0x1		/* Start of node, full name */
@@ -54,5 +57,16 @@ struct boot_param_header {
 	u32	dt_struct_size;		/* size of the DT structure block */
 };
 
+/* For scanning the flat device-tree at boot time */
+extern int __init of_scan_flat_dt(int (*it)(unsigned long node,
+					    const char *uname, int depth,
+					    void *data),
+				  void *data);
+extern void __init *of_get_flat_dt_prop(unsigned long node, const char *name,
+					unsigned long *size);
+extern int __init of_flat_dt_is_compatible(unsigned long node,
+					   const char *name);
+extern unsigned long __init of_get_flat_dt_root(void);
+
 #endif /* __ASSEMBLY__ */
 #endif /* _LINUX_OF_FDT_H */
-- 
cgit v1.2.3


From 82b2928c95d824afd9af3bb41660f3c3fa1f234e Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:58:07 -0600
Subject: of: merge other miscellaneous prototypes

Merge common prototypes used by Microblaze and PowerPC

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/include/asm/prom.h | 12 ------------
 arch/powerpc/include/asm/prom.h    | 14 --------------
 include/linux/of_fdt.h             | 14 ++++++++++++++
 3 files changed, 14 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/prom.h b/arch/microblaze/include/asm/prom.h
index be4db2a22245..ef3ec1d6ceb3 100644
--- a/arch/microblaze/include/asm/prom.h
+++ b/arch/microblaze/include/asm/prom.h
@@ -45,19 +45,7 @@ extern void of_attach_node(struct device_node *);
 extern void of_detach_node(struct device_node *);
 
 /* Other Prototypes */
-extern void finish_device_tree(void);
-extern void unflatten_device_tree(void);
 extern int early_uartlite_console(void);
-extern void early_init_devtree(void *);
-extern int machine_is_compatible(const char *compat);
-extern void print_properties(struct device_node *node);
-extern int prom_n_intr_cells(struct device_node *np);
-extern void prom_get_irq_senses(unsigned char *senses, int off, int max);
-extern int prom_add_property(struct device_node *np, struct property *prop);
-extern int prom_remove_property(struct device_node *np, struct property *prop);
-extern int prom_update_property(struct device_node *np,
-				struct property *newprop,
-				struct property *oldprop);
 
 extern struct resource *request_OF_resource(struct device_node *node,
 				int index, const char *name_postfix);
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index c8b59330b04a..2ab9cbd98826 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -38,20 +38,6 @@ extern struct device_node *of_chosen;
 extern void of_attach_node(struct device_node *);
 extern void of_detach_node(struct device_node *);
 
-/* Other Prototypes */
-extern void finish_device_tree(void);
-extern void unflatten_device_tree(void);
-extern void early_init_devtree(void *);
-extern int machine_is_compatible(const char *compat);
-extern void print_properties(struct device_node *node);
-extern int prom_n_intr_cells(struct device_node* np);
-extern void prom_get_irq_senses(unsigned char *senses, int off, int max);
-extern int prom_add_property(struct device_node* np, struct property* prop);
-extern int prom_remove_property(struct device_node *np, struct property *prop);
-extern int prom_update_property(struct device_node *np,
-				struct property *newprop,
-				struct property *oldprop);
-
 #ifdef CONFIG_PPC32
 /*
  * PCI <-> OF matching functions
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index b363eadea819..41d432b13553 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -68,5 +68,19 @@ extern int __init of_flat_dt_is_compatible(unsigned long node,
 					   const char *name);
 extern unsigned long __init of_get_flat_dt_root(void);
 
+/* Other Prototypes */
+extern void finish_device_tree(void);
+extern void unflatten_device_tree(void);
+extern void early_init_devtree(void *);
+extern int machine_is_compatible(const char *compat);
+extern void print_properties(struct device_node *node);
+extern int prom_n_intr_cells(struct device_node* np);
+extern void prom_get_irq_senses(unsigned char *senses, int off, int max);
+extern int prom_add_property(struct device_node* np, struct property* prop);
+extern int prom_remove_property(struct device_node *np, struct property *prop);
+extern int prom_update_property(struct device_node *np,
+				struct property *newprop,
+				struct property *oldprop);
+
 #endif /* __ASSEMBLY__ */
 #endif /* _LINUX_OF_FDT_H */
-- 
cgit v1.2.3


From e91edcf5a2940bb7f1f316c871dfe9e2aaf9d6d9 Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@secretlab.ca>
Date: Thu, 15 Oct 2009 10:58:09 -0600
Subject: of: merge of_find_all_nodes() implementations

Merge common code between Microblaze and PowerPC, and make it available
to Sparc

Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Wolfram Sang <w.sang@pengutronix.de>
Acked-by: Michal Simek <monstr@monstr.eu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/microblaze/kernel/prom.c | 23 -----------------------
 arch/powerpc/kernel/prom.c    | 23 -----------------------
 drivers/of/base.c             | 26 +++++++++++++++++++++++++-
 include/linux/of.h            |  3 ++-
 4 files changed, 27 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c
index c005cc6f1aaf..b817df172aa9 100644
--- a/arch/microblaze/kernel/prom.c
+++ b/arch/microblaze/kernel/prom.c
@@ -859,29 +859,6 @@ struct device_node *of_find_node_by_phandle(phandle handle)
 }
 EXPORT_SYMBOL(of_find_node_by_phandle);
 
-/**
- *	of_find_all_nodes - Get next node in global list
- *	@prev:	Previous node or NULL to start iteration
- *		of_node_put() will be called on it
- *
- *	Returns a node pointer with refcount incremented, use
- *	of_node_put() on it when done.
- */
-struct device_node *of_find_all_nodes(struct device_node *prev)
-{
-	struct device_node *np;
-
-	read_lock(&devtree_lock);
-	np = prev ? prev->allnext : allnodes;
-	for (; np != NULL; np = np->allnext)
-		if (of_node_get(np))
-			break;
-	of_node_put(prev);
-	read_unlock(&devtree_lock);
-	return np;
-}
-EXPORT_SYMBOL(of_find_all_nodes);
-
 /**
  *	of_node_get - Increment refcount of a node
  *	@node:	Node to inc refcount, NULL is supported to
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index d4405b95bfaa..4ec300862466 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -1316,29 +1316,6 @@ struct device_node *of_find_next_cache_node(struct device_node *np)
 	return NULL;
 }
 
-/**
- *	of_find_all_nodes - Get next node in global list
- *	@prev:	Previous node or NULL to start iteration
- *		of_node_put() will be called on it
- *
- *	Returns a node pointer with refcount incremented, use
- *	of_node_put() on it when done.
- */
-struct device_node *of_find_all_nodes(struct device_node *prev)
-{
-	struct device_node *np;
-
-	read_lock(&devtree_lock);
-	np = prev ? prev->allnext : allnodes;
-	for (; np != 0; np = np->allnext)
-		if (of_node_get(np))
-			break;
-	of_node_put(prev);
-	read_unlock(&devtree_lock);
-	return np;
-}
-EXPORT_SYMBOL(of_find_all_nodes);
-
 /**
  *	of_node_get - Increment refcount of a node
  *	@node:	Node to inc refcount, NULL is supported to
diff --git a/drivers/of/base.c b/drivers/of/base.c
index ddf224d456b2..e6627b2320f1 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -9,7 +9,8 @@
  *
  *  Adapted for sparc and sparc64 by David S. Miller davem@davemloft.net
  *
- *  Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell.
+ *  Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell and
+ *  Grant Likely.
  *
  *      This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -82,6 +83,29 @@ struct property *of_find_property(const struct device_node *np,
 }
 EXPORT_SYMBOL(of_find_property);
 
+/**
+ * of_find_all_nodes - Get next node in global list
+ * @prev:	Previous node or NULL to start iteration
+ *		of_node_put() will be called on it
+ *
+ * Returns a node pointer with refcount incremented, use
+ * of_node_put() on it when done.
+ */
+struct device_node *of_find_all_nodes(struct device_node *prev)
+{
+	struct device_node *np;
+
+	read_lock(&devtree_lock);
+	np = prev ? prev->allnext : allnodes;
+	for (; np != NULL; np = np->allnext)
+		if (of_node_get(np))
+			break;
+	of_node_put(prev);
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_find_all_nodes);
+
 /*
  * Find a property with a given name for a given node
  * and return the value.
diff --git a/include/linux/of.h b/include/linux/of.h
index 4636bba93afa..e7facd8fbce8 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -79,6 +79,8 @@ set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de)
 	dn->pde = de;
 }
 
+extern struct device_node *of_find_all_nodes(struct device_node *prev);
+
 #if defined(CONFIG_SPARC)
 /* Dummy ref counting routines - to be implemented later */
 static inline struct device_node *of_node_get(struct device_node *node)
@@ -90,7 +92,6 @@ static inline void of_node_put(struct device_node *node)
 }
 
 #else
-extern struct device_node *of_find_all_nodes(struct device_node *prev);
 extern struct device_node *of_node_get(struct device_node *node);
 extern void of_node_put(struct device_node *node);
 #endif
-- 
cgit v1.2.3


From c720c7e8383aff1cb219bddf474ed89d850336e3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 15 Oct 2009 06:30:45 +0000
Subject: inet: rename some inet_sock fields

In order to have better cache layouts of struct sock (separate zones
for rx/tx paths), we need this preliminary patch.

Goal is to transfert fields used at lookup time in the first
read-mostly cache line (inside struct sock_common) and move sk_refcnt
to a separate cache line (only written by rx path)

This patch adds inet_ prefix to daddr, rcv_saddr, dport, num, saddr,
sport and id fields. This allows a future patch to define these
fields as macros, like sk_refcnt, without name clashes.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pppol2tp.c                         | 22 ++++----
 fs/ocfs2/cluster/netdebug.c                    |  8 +--
 include/linux/ipv6.h                           |  2 +-
 include/net/inet6_hashtables.h                 |  4 +-
 include/net/inet_hashtables.h                  | 12 ++---
 include/net/inet_sock.h                        | 36 ++++++-------
 include/net/inet_timewait_sock.h               |  2 +-
 include/net/ip.h                               | 12 ++---
 net/dccp/ipv4.c                                | 40 ++++++++-------
 net/dccp/ipv6.c                                | 27 +++++-----
 net/dccp/output.c                              |  4 +-
 net/dccp/probe.c                               | 13 ++---
 net/dccp/proto.c                               |  4 +-
 net/ipv4/af_inet.c                             | 62 +++++++++++------------
 net/ipv4/datagram.c                            | 18 +++----
 net/ipv4/inet_connection_sock.c                | 20 ++++----
 net/ipv4/inet_diag.c                           | 26 +++++-----
 net/ipv4/inet_hashtables.c                     | 34 +++++++------
 net/ipv4/inet_timewait_sock.c                  | 12 ++---
 net/ipv4/ip_input.c                            |  2 +-
 net/ipv4/ip_output.c                           | 15 +++---
 net/ipv4/ip_sockglue.c                         |  8 +--
 net/ipv4/ipmr.c                                |  2 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  8 +--
 net/ipv4/raw.c                                 | 30 +++++------
 net/ipv4/tcp.c                                 |  4 +-
 net/ipv4/tcp_ipv4.c                            | 70 +++++++++++++-------------
 net/ipv4/tcp_output.c                          |  4 +-
 net/ipv4/tcp_probe.c                           | 11 ++--
 net/ipv4/tcp_timer.c                           |  8 +--
 net/ipv4/udp.c                                 | 51 ++++++++++---------
 net/ipv6/af_inet6.c                            | 28 +++++------
 net/ipv6/datagram.c                            | 15 +++---
 net/ipv6/inet6_connection_sock.c               |  6 +--
 net/ipv6/inet6_hashtables.c                    | 12 ++---
 net/ipv6/ip6mr.c                               |  2 +-
 net/ipv6/ipv6_sockglue.c                       |  6 +--
 net/ipv6/raw.c                                 | 30 +++++------
 net/ipv6/syncookies.c                          |  2 +-
 net/ipv6/tcp_ipv6.c                            | 32 ++++++------
 net/ipv6/udp.c                                 | 24 ++++-----
 net/netfilter/xt_socket.c                      |  2 +-
 net/rds/tcp_listen.c                           |  8 +--
 net/sctp/protocol.c                            |  8 +--
 net/sctp/socket.c                              | 30 +++++------
 net/sunrpc/svcsock.c                           |  8 +--
 security/lsm_audit.c                           | 12 ++---
 47 files changed, 408 insertions(+), 388 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/pppol2tp.c b/drivers/net/pppol2tp.c
index 5910df60c93e..849cc9c62c2a 100644
--- a/drivers/net/pppol2tp.c
+++ b/drivers/net/pppol2tp.c
@@ -516,7 +516,7 @@ static inline int pppol2tp_verify_udp_checksum(struct sock *sk,
 		return 0;
 
 	inet = inet_sk(sk);
-	psum = csum_tcpudp_nofold(inet->saddr, inet->daddr, ulen,
+	psum = csum_tcpudp_nofold(inet->inet_saddr, inet->inet_daddr, ulen,
 				  IPPROTO_UDP, 0);
 
 	if ((skb->ip_summed == CHECKSUM_COMPLETE) &&
@@ -949,8 +949,8 @@ static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msgh
 	inet = inet_sk(sk_tun);
 	udp_len = hdr_len + sizeof(ppph) + total_len;
 	uh = (struct udphdr *) skb->data;
-	uh->source = inet->sport;
-	uh->dest = inet->dport;
+	uh->source = inet->inet_sport;
+	uh->dest = inet->inet_dport;
 	uh->len = htons(udp_len);
 	uh->check = 0;
 	skb_put(skb, sizeof(struct udphdr));
@@ -978,7 +978,8 @@ static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msgh
 	else if (!(skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
 		skb->ip_summed = CHECKSUM_COMPLETE;
 		csum = skb_checksum(skb, 0, udp_len, 0);
-		uh->check = csum_tcpudp_magic(inet->saddr, inet->daddr,
+		uh->check = csum_tcpudp_magic(inet->inet_saddr,
+					      inet->inet_daddr,
 					      udp_len, IPPROTO_UDP, csum);
 		if (uh->check == 0)
 			uh->check = CSUM_MANGLED_0;
@@ -986,7 +987,8 @@ static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msgh
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
-		uh->check = ~csum_tcpudp_magic(inet->saddr, inet->daddr,
+		uh->check = ~csum_tcpudp_magic(inet->inet_saddr,
+					       inet->inet_daddr,
 					       udp_len, IPPROTO_UDP, 0);
 	}
 
@@ -1136,8 +1138,8 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 	__skb_push(skb, sizeof(*uh));
 	skb_reset_transport_header(skb);
 	uh = udp_hdr(skb);
-	uh->source = inet->sport;
-	uh->dest = inet->dport;
+	uh->source = inet->inet_sport;
+	uh->dest = inet->inet_dport;
 	uh->len = htons(udp_len);
 	uh->check = 0;
 
@@ -1181,7 +1183,8 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 	else if (!(skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
 		skb->ip_summed = CHECKSUM_COMPLETE;
 		csum = skb_checksum(skb, 0, udp_len, 0);
-		uh->check = csum_tcpudp_magic(inet->saddr, inet->daddr,
+		uh->check = csum_tcpudp_magic(inet->inet_saddr,
+					      inet->inet_daddr,
 					      udp_len, IPPROTO_UDP, csum);
 		if (uh->check == 0)
 			uh->check = CSUM_MANGLED_0;
@@ -1189,7 +1192,8 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
-		uh->check = ~csum_tcpudp_magic(inet->saddr, inet->daddr,
+		uh->check = ~csum_tcpudp_magic(inet->inet_saddr,
+					       inet->inet_daddr,
 					       udp_len, IPPROTO_UDP, 0);
 	}
 
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index da794bc07a6c..a3f150e52b02 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -294,10 +294,10 @@ static int sc_seq_show(struct seq_file *seq, void *v)
 		if (sc->sc_sock) {
 			inet = inet_sk(sc->sc_sock->sk);
 			/* the stack's structs aren't sparse endian clean */
-			saddr = (__force __be32)inet->saddr;
-			daddr = (__force __be32)inet->daddr;
-			sport = (__force __be16)inet->sport;
-			dport = (__force __be16)inet->dport;
+			saddr = (__force __be32)inet->inet_saddr;
+			daddr = (__force __be32)inet->inet_daddr;
+			sport = (__force __be16)inet->inet_sport;
+			dport = (__force __be16)inet->inet_dport;
 		}
 
 		/* XXX sigh, inet-> doesn't have sparse annotation so any
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 56404251248c..e0cc9a7db2b5 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -505,7 +505,7 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 
 #define INET6_MATCH(__sk, __net, __hash, __saddr, __daddr, __ports, __dif)\
 	(((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net)	&& \
-	 ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports))  	&& \
+	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports)) && \
 	 ((__sk)->sk_family		== AF_INET6)		&& \
 	 ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr))	&& \
 	 ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr))	&& \
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 22c73a77cd99..92838d3a1ab7 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -46,8 +46,8 @@ static inline int inet6_sk_ehashfn(const struct sock *sk)
 	const struct ipv6_pinfo *np = inet6_sk(sk);
 	const struct in6_addr *laddr = &np->rcv_saddr;
 	const struct in6_addr *faddr = &np->daddr;
-	const __u16 lport = inet->num;
-	const __be16 fport = inet->dport;
+	const __u16 lport = inet->inet_num;
+	const __be16 fport = inet->inet_dport;
 	struct net *net = sock_net(sk);
 
 	return inet6_ehashfn(net, laddr, lport, faddr, fport);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 5f11c4a0daca..5b698b3b463d 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -241,7 +241,7 @@ static inline int inet_lhashfn(struct net *net, const unsigned short num)
 
 static inline int inet_sk_listen_hashfn(const struct sock *sk)
 {
-	return inet_lhashfn(sock_net(sk), inet_sk(sk)->num);
+	return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num);
 }
 
 /* Caller must disable local BH processing. */
@@ -301,8 +301,8 @@ typedef __u64 __bitwise __addrpair;
 #endif /* __BIG_ENDIAN */
 #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
 	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) &&	\
-	 ((*((__addrpair *)&(inet_sk(__sk)->daddr))) == (__cookie))	&&	\
-	 ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports))	&&	\
+	 ((*((__addrpair *)&(inet_sk(__sk)->inet_daddr))) == (__cookie))  &&	\
+	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports))   &&	\
 	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
 #define INET_TW_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
 	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) &&	\
@@ -313,9 +313,9 @@ typedef __u64 __bitwise __addrpair;
 #define INET_ADDR_COOKIE(__name, __saddr, __daddr)
 #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)	\
 	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net))	&&	\
-	 (inet_sk(__sk)->daddr		== (__saddr))		&&	\
-	 (inet_sk(__sk)->rcv_saddr	== (__daddr))		&&	\
-	 ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports))	&&	\
+	 (inet_sk(__sk)->inet_daddr	== (__saddr))		&&	\
+	 (inet_sk(__sk)->inet_rcv_saddr	== (__daddr))		&&	\
+	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports))	&&	\
 	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
 #define INET_TW_MATCH(__sk, __net, __hash,__cookie, __saddr, __daddr, __ports, __dif)	\
 	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net))	&&	\
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 47004f35cc7e..bd4c53f75ac0 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -93,14 +93,14 @@ struct rtable;
  *
  * @sk - ancestor class
  * @pinet6 - pointer to IPv6 control block
- * @daddr - Foreign IPv4 addr
- * @rcv_saddr - Bound local IPv4 addr
- * @dport - Destination port
- * @num - Local port
- * @saddr - Sending source
+ * @inet_daddr - Foreign IPv4 addr
+ * @inet_rcv_saddr - Bound local IPv4 addr
+ * @inet_dport - Destination port
+ * @inet_num - Local port
+ * @inet_saddr - Sending source
  * @uc_ttl - Unicast TTL
- * @sport - Source port
- * @id - ID counter for DF pkts
+ * @inet_sport - Source port
+ * @inet_id - ID counter for DF pkts
  * @tos - TOS
  * @mc_ttl - Multicasting TTL
  * @is_icsk - is this an inet_connection_sock?
@@ -115,16 +115,16 @@ struct inet_sock {
 	struct ipv6_pinfo	*pinet6;
 #endif
 	/* Socket demultiplex comparisons on incoming packets. */
-	__be32			daddr;
-	__be32			rcv_saddr;
-	__be16			dport;
-	__u16			num;
-	__be32			saddr;
+	__be32			inet_daddr;
+	__be32			inet_rcv_saddr;
+	__be16			inet_dport;
+	__u16			inet_num;
+	__be32			inet_saddr;
 	__s16			uc_ttl;
 	__u16			cmsg_flags;
 	struct ip_options	*opt;
-	__be16			sport;
-	__u16			id;
+	__be16			inet_sport;
+	__u16			inet_id;
 	__u8			tos;
 	__u8			mc_ttl;
 	__u8			pmtudisc;
@@ -190,10 +190,10 @@ static inline unsigned int inet_ehashfn(struct net *net,
 static inline int inet_sk_ehashfn(const struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
-	const __be32 laddr = inet->rcv_saddr;
-	const __u16 lport = inet->num;
-	const __be32 faddr = inet->daddr;
-	const __be16 fport = inet->dport;
+	const __be32 laddr = inet->inet_rcv_saddr;
+	const __u16 lport = inet->inet_num;
+	const __be32 faddr = inet->inet_daddr;
+	const __be16 fport = inet->inet_dport;
 	struct net *net = sock_net(sk);
 
 	return inet_ehashfn(net, laddr, lport, faddr, fport);
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index b63b80fac567..37f3aea074a5 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -194,7 +194,7 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
 static inline __be32 inet_rcv_saddr(const struct sock *sk)
 {
 	return likely(sk->sk_state != TCP_TIME_WAIT) ?
-		inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr;
+		inet_sk(sk)->inet_rcv_saddr : inet_twsk(sk)->tw_rcv_saddr;
 }
 
 extern void inet_twsk_put(struct inet_timewait_sock *tw);
diff --git a/include/net/ip.h b/include/net/ip.h
index 2f47e5482b55..376adf47764e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -240,8 +240,8 @@ static inline void ip_select_ident(struct iphdr *iph, struct dst_entry *dst, str
 		 * does not change, they drop every other packet in
 		 * a TCP stream using header compression.
 		 */
-		iph->id = (sk && inet_sk(sk)->daddr) ?
-					htons(inet_sk(sk)->id++) : 0;
+		iph->id = (sk && inet_sk(sk)->inet_daddr) ?
+					htons(inet_sk(sk)->inet_id++) : 0;
 	} else
 		__ip_select_ident(iph, dst, 0);
 }
@@ -249,9 +249,9 @@ static inline void ip_select_ident(struct iphdr *iph, struct dst_entry *dst, str
 static inline void ip_select_ident_more(struct iphdr *iph, struct dst_entry *dst, struct sock *sk, int more)
 {
 	if (iph->frag_off & htons(IP_DF)) {
-		if (sk && inet_sk(sk)->daddr) {
-			iph->id = htons(inet_sk(sk)->id);
-			inet_sk(sk)->id += 1 + more;
+		if (sk && inet_sk(sk)->inet_daddr) {
+			iph->id = htons(inet_sk(sk)->inet_id);
+			inet_sk(sk)->inet_id += 1 + more;
 		} else
 			iph->id = 0;
 	} else
@@ -317,7 +317,7 @@ static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, ch
 
 static __inline__ void inet_reset_saddr(struct sock *sk)
 {
-	inet_sk(sk)->rcv_saddr = inet_sk(sk)->saddr = 0;
+	inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	if (sk->sk_family == PF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 7302e1498d46..00028d4b09d9 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -62,10 +62,10 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		nexthop = inet->opt->faddr;
 	}
 
-	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
+	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 			       IPPROTO_DCCP,
-			       inet->sport, usin->sin_port, sk, 1);
+			       inet->inet_sport, usin->sin_port, sk, 1);
 	if (tmp < 0)
 		return tmp;
 
@@ -77,12 +77,12 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (inet->opt == NULL || !inet->opt->srr)
 		daddr = rt->rt_dst;
 
-	if (inet->saddr == 0)
-		inet->saddr = rt->rt_src;
-	inet->rcv_saddr = inet->saddr;
+	if (inet->inet_saddr == 0)
+		inet->inet_saddr = rt->rt_src;
+	inet->inet_rcv_saddr = inet->inet_saddr;
 
-	inet->dport = usin->sin_port;
-	inet->daddr = daddr;
+	inet->inet_dport = usin->sin_port;
+	inet->inet_daddr = daddr;
 
 	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet->opt != NULL)
@@ -98,17 +98,19 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (err != 0)
 		goto failure;
 
-	err = ip_route_newports(&rt, IPPROTO_DCCP, inet->sport, inet->dport,
-				sk);
+	err = ip_route_newports(&rt, IPPROTO_DCCP, inet->inet_sport,
+				inet->inet_dport, sk);
 	if (err != 0)
 		goto failure;
 
 	/* OK, now commit destination to socket.  */
 	sk_setup_caps(sk, &rt->u.dst);
 
-	dp->dccps_iss = secure_dccp_sequence_number(inet->saddr, inet->daddr,
-						    inet->sport, inet->dport);
-	inet->id = dp->dccps_iss ^ jiffies;
+	dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
+						    inet->inet_daddr,
+						    inet->inet_sport,
+						    inet->inet_dport);
+	inet->inet_id = dp->dccps_iss ^ jiffies;
 
 	err = dccp_connect(sk);
 	rt = NULL;
@@ -123,7 +125,7 @@ failure:
 	dccp_set_state(sk, DCCP_CLOSED);
 	ip_rt_put(rt);
 	sk->sk_route_caps = 0;
-	inet->dport = 0;
+	inet->inet_dport = 0;
 	goto out;
 }
 
@@ -352,7 +354,9 @@ void dccp_v4_send_check(struct sock *sk, int unused, struct sk_buff *skb)
 	struct dccp_hdr *dh = dccp_hdr(skb);
 
 	dccp_csum_outgoing(skb);
-	dh->dccph_checksum = dccp_v4_csum_finish(skb, inet->saddr, inet->daddr);
+	dh->dccph_checksum = dccp_v4_csum_finish(skb,
+						 inet->inet_saddr,
+						 inet->inet_daddr);
 }
 
 EXPORT_SYMBOL_GPL(dccp_v4_send_check);
@@ -393,14 +397,14 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newinet		   = inet_sk(newsk);
 	ireq		   = inet_rsk(req);
-	newinet->daddr	   = ireq->rmt_addr;
-	newinet->rcv_saddr = ireq->loc_addr;
-	newinet->saddr	   = ireq->loc_addr;
+	newinet->inet_daddr	= ireq->rmt_addr;
+	newinet->inet_rcv_saddr = ireq->loc_addr;
+	newinet->inet_saddr	= ireq->loc_addr;
 	newinet->opt	   = ireq->opt;
 	ireq->opt	   = NULL;
 	newinet->mc_index  = inet_iif(skb);
 	newinet->mc_ttl	   = ip_hdr(skb)->ttl;
-	newinet->id	   = jiffies;
+	newinet->inet_id   = jiffies;
 
 	dccp_sync_mss(newsk, dst_mtu(dst));
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index a2afb553d8b3..6d89f9f7d5d8 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -158,8 +158,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
 			ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 			fl.oif = sk->sk_bound_dev_if;
-			fl.fl_ip_dport = inet->dport;
-			fl.fl_ip_sport = inet->sport;
+			fl.fl_ip_dport = inet->inet_dport;
+			fl.fl_ip_sport = inet->inet_sport;
 			security_sk_classify_flow(sk, &fl);
 
 			err = ip6_dst_lookup(sk, &dst, &fl);
@@ -510,9 +510,9 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-		ipv6_addr_set_v4mapped(newinet->daddr, &newnp->daddr);
+		ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr);
 
-		ipv6_addr_set_v4mapped(newinet->saddr, &newnp->saddr);
+		ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr);
 
 		ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
 
@@ -640,7 +640,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 	dccp_sync_mss(newsk, dst_mtu(dst));
 
-	newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
+	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
+	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
 	__inet6_hash(newsk);
 	__inet_inherit_port(sk, newsk);
@@ -968,10 +969,9 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			icsk->icsk_af_ops = &dccp_ipv6_af_ops;
 			sk->sk_backlog_rcv = dccp_v6_do_rcv;
 			goto failure;
-		} else {
-			ipv6_addr_set_v4mapped(inet->saddr, &np->saddr);
-			ipv6_addr_set_v4mapped(inet->rcv_saddr, &np->rcv_saddr);
 		}
+		ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
+		ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &np->rcv_saddr);
 
 		return err;
 	}
@@ -984,7 +984,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
 	fl.oif = sk->sk_bound_dev_if;
 	fl.fl_ip_dport = usin->sin6_port;
-	fl.fl_ip_sport = inet->sport;
+	fl.fl_ip_sport = inet->inet_sport;
 	security_sk_classify_flow(sk, &fl);
 
 	if (np->opt != NULL && np->opt->srcrt != NULL) {
@@ -1017,7 +1017,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	/* set the source address */
 	ipv6_addr_copy(&np->saddr, saddr);
-	inet->rcv_saddr = LOOPBACK4_IPV6;
+	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
 	__ip6_dst_store(sk, dst, NULL, NULL);
 
@@ -1026,7 +1026,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
 					  np->opt->opt_nflen);
 
-	inet->dport = usin->sin6_port;
+	inet->inet_dport = usin->sin6_port;
 
 	dccp_set_state(sk, DCCP_REQUESTING);
 	err = inet6_hash_connect(&dccp_death_row, sk);
@@ -1035,7 +1035,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
 						      np->daddr.s6_addr32,
-						      inet->sport, inet->dport);
+						      inet->inet_sport,
+						      inet->inet_dport);
 	err = dccp_connect(sk);
 	if (err)
 		goto late_failure;
@@ -1046,7 +1047,7 @@ late_failure:
 	dccp_set_state(sk, DCCP_CLOSED);
 	__sk_dst_reset(sk);
 failure:
-	inet->dport = 0;
+	inet->inet_dport = 0;
 	sk->sk_route_caps = 0;
 	return err;
 }
diff --git a/net/dccp/output.c b/net/dccp/output.c
index c96119fda688..d6bb753bf6ad 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -99,8 +99,8 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		/* Build DCCP header and checksum it. */
 		dh = dccp_zeroed_hdr(skb, dccp_header_size);
 		dh->dccph_type	= dcb->dccpd_type;
-		dh->dccph_sport	= inet->sport;
-		dh->dccph_dport	= inet->dport;
+		dh->dccph_sport	= inet->inet_sport;
+		dh->dccph_dport	= inet->inet_dport;
 		dh->dccph_doff	= (dccp_header_size + dcb->dccpd_opt_len) / 4;
 		dh->dccph_ccval	= dcb->dccpd_ccval;
 		dh->dccph_cscov = dp->dccps_pcslen;
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 5e6ec8b9b7b6..dc328425fa20 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -80,19 +80,20 @@ static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
 	if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
 		hc = ccid3_hc_tx_sk(sk);
 
-	if (port == 0 || ntohs(inet->dport) == port ||
-	    ntohs(inet->sport) == port) {
+	if (port == 0 || ntohs(inet->inet_dport) == port ||
+	    ntohs(inet->inet_sport) == port) {
 		if (hc)
 			printl("%pI4:%u %pI4:%u %d %d %d %d %u %llu %llu %d\n",
-			       &inet->saddr, ntohs(inet->sport),
-			       &inet->daddr, ntohs(inet->dport), size,
+			       &inet->inet_saddr, ntohs(inet->inet_sport),
+			       &inet->inet_daddr, ntohs(inet->inet_dport), size,
 			       hc->tx_s, hc->tx_rtt, hc->tx_p,
 			       hc->tx_x_calc, hc->tx_x_recv >> 6,
 			       hc->tx_x >> 6, hc->tx_t_ipi);
 		else
 			printl("%pI4:%u %pI4:%u %d\n",
-			       &inet->saddr, ntohs(inet->sport),
-			       &inet->daddr, ntohs(inet->dport), size);
+			       &inet->inet_saddr, ntohs(inet->inet_sport),
+			       &inet->inet_daddr, ntohs(inet->inet_dport),
+			       size);
 	}
 
 	jprobe_return();
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ecb203fff501..671cd1413d59 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -278,7 +278,7 @@ int dccp_disconnect(struct sock *sk, int flags)
 		sk->sk_send_head = NULL;
 	}
 
-	inet->dport = 0;
+	inet->inet_dport = 0;
 
 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 		inet_reset_saddr(sk);
@@ -290,7 +290,7 @@ int dccp_disconnect(struct sock *sk, int flags)
 	inet_csk_delack_init(sk);
 	__sk_dst_reset(sk);
 
-	WARN_ON(inet->num && !icsk->icsk_bind_hash);
+	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
 	sk->sk_error_report(sk);
 	return err;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1deff48b122e..04a14b1600ac 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -174,12 +174,12 @@ static int inet_autobind(struct sock *sk)
 	/* We may need to bind the socket. */
 	lock_sock(sk);
 	inet = inet_sk(sk);
-	if (!inet->num) {
+	if (!inet->inet_num) {
 		if (sk->sk_prot->get_port(sk, 0)) {
 			release_sock(sk);
 			return -EAGAIN;
 		}
-		inet->sport = htons(inet->num);
+		inet->inet_sport = htons(inet->inet_num);
 	}
 	release_sock(sk);
 	return 0;
@@ -354,7 +354,7 @@ lookup_protocol:
 	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
 	if (SOCK_RAW == sock->type) {
-		inet->num = protocol;
+		inet->inet_num = protocol;
 		if (IPPROTO_RAW == protocol)
 			inet->hdrincl = 1;
 	}
@@ -364,7 +364,7 @@ lookup_protocol:
 	else
 		inet->pmtudisc = IP_PMTUDISC_WANT;
 
-	inet->id = 0;
+	inet->inet_id = 0;
 
 	sock_init_data(sock, sk);
 
@@ -381,13 +381,13 @@ lookup_protocol:
 
 	sk_refcnt_debug_inc(sk);
 
-	if (inet->num) {
+	if (inet->inet_num) {
 		/* It assumes that any protocol which allows
 		 * the user to assign a number at socket
 		 * creation time automatically
 		 * shares.
 		 */
-		inet->sport = htons(inet->num);
+		inet->inet_sport = htons(inet->inet_num);
 		/* Add to protocol hash chains. */
 		sk->sk_prot->hash(sk);
 	}
@@ -494,27 +494,27 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	/* Check these errors (active socket, double bind). */
 	err = -EINVAL;
-	if (sk->sk_state != TCP_CLOSE || inet->num)
+	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
 		goto out_release_sock;
 
-	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
 	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
-		inet->saddr = 0;  /* Use device */
+		inet->inet_saddr = 0;  /* Use device */
 
 	/* Make sure we are allowed to bind here. */
 	if (sk->sk_prot->get_port(sk, snum)) {
-		inet->saddr = inet->rcv_saddr = 0;
+		inet->inet_saddr = inet->inet_rcv_saddr = 0;
 		err = -EADDRINUSE;
 		goto out_release_sock;
 	}
 
-	if (inet->rcv_saddr)
+	if (inet->inet_rcv_saddr)
 		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
 	if (snum)
 		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
-	inet->sport = htons(inet->num);
-	inet->daddr = 0;
-	inet->dport = 0;
+	inet->inet_sport = htons(inet->inet_num);
+	inet->inet_daddr = 0;
+	inet->inet_dport = 0;
 	sk_dst_reset(sk);
 	err = 0;
 out_release_sock:
@@ -532,7 +532,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
 	if (uaddr->sa_family == AF_UNSPEC)
 		return sk->sk_prot->disconnect(sk, flags);
 
-	if (!inet_sk(sk)->num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
 }
@@ -689,17 +689,17 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 
 	sin->sin_family = AF_INET;
 	if (peer) {
-		if (!inet->dport ||
+		if (!inet->inet_dport ||
 		    (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
 		     peer == 1))
 			return -ENOTCONN;
-		sin->sin_port = inet->dport;
-		sin->sin_addr.s_addr = inet->daddr;
+		sin->sin_port = inet->inet_dport;
+		sin->sin_addr.s_addr = inet->inet_daddr;
 	} else {
-		__be32 addr = inet->rcv_saddr;
+		__be32 addr = inet->inet_rcv_saddr;
 		if (!addr)
-			addr = inet->saddr;
-		sin->sin_port = inet->sport;
+			addr = inet->inet_saddr;
+		sin->sin_port = inet->inet_sport;
 		sin->sin_addr.s_addr = addr;
 	}
 	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
@@ -714,7 +714,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	struct sock *sk = sock->sk;
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 
 	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
@@ -728,7 +728,7 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 	struct sock *sk = sock->sk;
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 
 	if (sk->sk_prot->sendpage)
@@ -1059,9 +1059,9 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	int err;
 	struct rtable *rt;
-	__be32 old_saddr = inet->saddr;
+	__be32 old_saddr = inet->inet_saddr;
 	__be32 new_saddr;
-	__be32 daddr = inet->daddr;
+	__be32 daddr = inet->inet_daddr;
 
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
@@ -1071,7 +1071,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 			       RT_CONN_FLAGS(sk),
 			       sk->sk_bound_dev_if,
 			       sk->sk_protocol,
-			       inet->sport, inet->dport, sk, 0);
+			       inet->inet_sport, inet->inet_dport, sk, 0);
 	if (err)
 		return err;
 
@@ -1087,7 +1087,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 		       __func__, &old_saddr, &new_saddr);
 	}
 
-	inet->saddr = inet->rcv_saddr = new_saddr;
+	inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
 
 	/*
 	 * XXX The only one ugly spot where we need to
@@ -1113,7 +1113,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		return 0;
 
 	/* Reroute. */
-	daddr = inet->daddr;
+	daddr = inet->inet_daddr;
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
 {
@@ -1123,7 +1123,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		.nl_u = {
 			.ip4_u = {
 				.daddr	= daddr,
-				.saddr	= inet->saddr,
+				.saddr	= inet->inet_saddr,
 				.tos	= RT_CONN_FLAGS(sk),
 			},
 		},
@@ -1131,8 +1131,8 @@ int inet_sk_rebuild_header(struct sock *sk)
 		.flags = inet_sk_flowi_flags(sk),
 		.uli_u = {
 			.ports = {
-				.sport = inet->sport,
-				.dport = inet->dport,
+				.sport = inet->inet_sport,
+				.dport = inet->inet_dport,
 			},
 		},
 	};
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 5e6c5a0f3fde..fb2465811b48 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -39,7 +39,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk_dst_reset(sk);
 
 	oif = sk->sk_bound_dev_if;
-	saddr = inet->saddr;
+	saddr = inet->inet_saddr;
 	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
 		if (!oif)
 			oif = inet->mc_index;
@@ -49,7 +49,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
 			       RT_CONN_FLAGS(sk), oif,
 			       sk->sk_protocol,
-			       inet->sport, usin->sin_port, sk, 1);
+			       inet->inet_sport, usin->sin_port, sk, 1);
 	if (err) {
 		if (err == -ENETUNREACH)
 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -60,14 +60,14 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		ip_rt_put(rt);
 		return -EACCES;
 	}
-	if (!inet->saddr)
-		inet->saddr = rt->rt_src;	/* Update source address */
-	if (!inet->rcv_saddr)
-		inet->rcv_saddr = rt->rt_src;
-	inet->daddr = rt->rt_dst;
-	inet->dport = usin->sin_port;
+	if (!inet->inet_saddr)
+		inet->inet_saddr = rt->rt_src;	/* Update source address */
+	if (!inet->inet_rcv_saddr)
+		inet->inet_rcv_saddr = rt->rt_src;
+	inet->inet_daddr = rt->rt_dst;
+	inet->inet_dport = usin->sin_port;
 	sk->sk_state = TCP_ESTABLISHED;
-	inet->id = jiffies;
+	inet->inet_id = jiffies;
 
 	sk_dst_set(sk, &rt->u.dst);
 	return(0);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 9139e8f6fdb1..f6a0af759932 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -368,7 +368,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 			    .proto = sk->sk_protocol,
 			    .flags = inet_sk_flowi_flags(sk),
 			    .uli_u = { .ports =
-				       { .sport = inet_sk(sk)->sport,
+				       { .sport = inet_sk(sk)->inet_sport,
 					 .dport = ireq->rmt_port } } };
 	struct net *net = sock_net(sk);
 
@@ -547,9 +547,9 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 		newsk->sk_state = TCP_SYN_RECV;
 		newicsk->icsk_bind_hash = NULL;
 
-		inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
-		inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port);
-		inet_sk(newsk)->sport = inet_rsk(req)->loc_port;
+		inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
+		inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
+		inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
 		newsk->sk_write_space = sk_stream_write_space;
 
 		newicsk->icsk_retransmits = 0;
@@ -580,8 +580,8 @@ void inet_csk_destroy_sock(struct sock *sk)
 	/* It cannot be in hash table! */
 	WARN_ON(!sk_unhashed(sk));
 
-	/* If it has not 0 inet_sk(sk)->num, it must be bound */
-	WARN_ON(inet_sk(sk)->num && !inet_csk(sk)->icsk_bind_hash);
+	/* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
+	WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
 
 	sk->sk_prot->destroy(sk);
 
@@ -616,8 +616,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
 	 * after validation is complete.
 	 */
 	sk->sk_state = TCP_LISTEN;
-	if (!sk->sk_prot->get_port(sk, inet->num)) {
-		inet->sport = htons(inet->num);
+	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
+		inet->inet_sport = htons(inet->inet_num);
 
 		sk_dst_reset(sk);
 		sk->sk_prot->hash(sk);
@@ -693,8 +693,8 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 	const struct inet_sock *inet = inet_sk(sk);
 
 	sin->sin_family		= AF_INET;
-	sin->sin_addr.s_addr	= inet->daddr;
-	sin->sin_port		= inet->dport;
+	sin->sin_addr.s_addr	= inet->inet_daddr;
+	sin->sin_port		= inet->inet_dport;
 }
 
 EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index cb73fdefba91..bdb78dd180ce 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -116,10 +116,10 @@ static int inet_csk_diag_fill(struct sock *sk,
 	r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
 	r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
 
-	r->id.idiag_sport = inet->sport;
-	r->id.idiag_dport = inet->dport;
-	r->id.idiag_src[0] = inet->rcv_saddr;
-	r->id.idiag_dst[0] = inet->daddr;
+	r->id.idiag_sport = inet->inet_sport;
+	r->id.idiag_dport = inet->inet_dport;
+	r->id.idiag_src[0] = inet->inet_rcv_saddr;
+	r->id.idiag_dst[0] = inet->inet_daddr;
 
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 	if (r->idiag_family == AF_INET6) {
@@ -504,11 +504,11 @@ static int inet_csk_diag_dump(struct sock *sk,
 		} else
 #endif
 		{
-			entry.saddr = &inet->rcv_saddr;
-			entry.daddr = &inet->daddr;
+			entry.saddr = &inet->inet_rcv_saddr;
+			entry.daddr = &inet->inet_daddr;
 		}
-		entry.sport = inet->num;
-		entry.dport = ntohs(inet->dport);
+		entry.sport = inet->inet_num;
+		entry.dport = ntohs(inet->inet_dport);
 		entry.userlocks = sk->sk_userlocks;
 
 		if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
@@ -584,7 +584,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	if (tmo < 0)
 		tmo = 0;
 
-	r->id.idiag_sport = inet->sport;
+	r->id.idiag_sport = inet->inet_sport;
 	r->id.idiag_dport = ireq->rmt_port;
 	r->id.idiag_src[0] = ireq->loc_addr;
 	r->id.idiag_dst[0] = ireq->rmt_addr;
@@ -639,7 +639,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
 		bc = (struct rtattr *)(r + 1);
-		entry.sport = inet->num;
+		entry.sport = inet->inet_num;
 		entry.userlocks = sk->sk_userlocks;
 	}
 
@@ -732,7 +732,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 					continue;
 				}
 
-				if (r->id.idiag_sport != inet->sport &&
+				if (r->id.idiag_sport != inet->inet_sport &&
 				    r->id.idiag_sport)
 					goto next_listen;
 
@@ -797,10 +797,10 @@ skip_listen_ht:
 				goto next_normal;
 			if (!(r->idiag_states & (1 << sk->sk_state)))
 				goto next_normal;
-			if (r->id.idiag_sport != inet->sport &&
+			if (r->id.idiag_sport != inet->inet_sport &&
 			    r->id.idiag_sport)
 				goto next_normal;
-			if (r->id.idiag_dport != inet->dport &&
+			if (r->id.idiag_dport != inet->inet_dport &&
 			    r->id.idiag_dport)
 				goto next_normal;
 			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index a45aaf3d48b1..47ad7aab51e3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -64,7 +64,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 
 	atomic_inc(&hashinfo->bsockets);
 
-	inet_sk(sk)->num = snum;
+	inet_sk(sk)->inet_num = snum;
 	sk_add_bind_node(sk, &tb->owners);
 	tb->num_owners++;
 	inet_csk(sk)->icsk_bind_hash = tb;
@@ -76,7 +76,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 static void __inet_put_port(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num,
+	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
 			hashinfo->bhash_size);
 	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
 	struct inet_bind_bucket *tb;
@@ -88,7 +88,7 @@ static void __inet_put_port(struct sock *sk)
 	__sk_del_bind_node(sk);
 	tb->num_owners--;
 	inet_csk(sk)->icsk_bind_hash = NULL;
-	inet_sk(sk)->num = 0;
+	inet_sk(sk)->inet_num = 0;
 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
 	spin_unlock(&head->lock);
 }
@@ -105,7 +105,7 @@ EXPORT_SYMBOL(inet_put_port);
 void __inet_inherit_port(struct sock *sk, struct sock *child)
 {
 	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
-	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num,
+	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
 			table->bhash_size);
 	struct inet_bind_hashbucket *head = &table->bhash[bhash];
 	struct inet_bind_bucket *tb;
@@ -126,9 +126,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
 	int score = -1;
 	struct inet_sock *inet = inet_sk(sk);
 
-	if (net_eq(sock_net(sk), net) && inet->num == hnum &&
+	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
 			!ipv6_only_sock(sk)) {
-		__be32 rcv_saddr = inet->rcv_saddr;
+		__be32 rcv_saddr = inet->inet_rcv_saddr;
 		score = sk->sk_family == PF_INET ? 1 : 0;
 		if (rcv_saddr) {
 			if (rcv_saddr != daddr)
@@ -273,13 +273,14 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	struct inet_sock *inet = inet_sk(sk);
-	__be32 daddr = inet->rcv_saddr;
-	__be32 saddr = inet->daddr;
+	__be32 daddr = inet->inet_rcv_saddr;
+	__be32 saddr = inet->inet_daddr;
 	int dif = sk->sk_bound_dev_if;
 	INET_ADDR_COOKIE(acookie, saddr, daddr)
-	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
 	struct net *net = sock_net(sk);
-	unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
+	unsigned int hash = inet_ehashfn(net, daddr, lport,
+					 saddr, inet->inet_dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
 	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
@@ -312,8 +313,8 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 unique:
 	/* Must record num and sport now. Otherwise we will see
 	 * in hash table socket with a funny identity. */
-	inet->num = lport;
-	inet->sport = htons(lport);
+	inet->inet_num = lport;
+	inet->inet_sport = htons(lport);
 	sk->sk_hash = hash;
 	WARN_ON(!sk_unhashed(sk));
 	__sk_nulls_add_node_rcu(sk, &head->chain);
@@ -341,8 +342,9 @@ not_unique:
 static inline u32 inet_sk_port_offset(const struct sock *sk)
 {
 	const struct inet_sock *inet = inet_sk(sk);
-	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
-					  inet->dport);
+	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
+					  inet->inet_daddr,
+					  inet->inet_dport);
 }
 
 void __inet_hash_nolisten(struct sock *sk)
@@ -424,7 +426,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		void (*hash)(struct sock *sk))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
-	const unsigned short snum = inet_sk(sk)->num;
+	const unsigned short snum = inet_sk(sk)->inet_num;
 	struct inet_bind_hashbucket *head;
 	struct inet_bind_bucket *tb;
 	int ret;
@@ -485,7 +487,7 @@ ok:
 		/* Head lock still held and bh's disabled */
 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
-			inet_sk(sk)->sport = htons(port);
+			inet_sk(sk)->inet_sport = htons(port);
 			hash(sk);
 		}
 		spin_unlock(&head->lock);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2fe571155b22..1f5d508bb18b 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -86,7 +86,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	   Note, that any socket with inet->num != 0 MUST be bound in
 	   binding cache, even if it is closed.
 	 */
-	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num,
+	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
 			hashinfo->bhash_size)];
 	spin_lock(&bhead->lock);
 	tw->tw_tb = icsk->icsk_bind_hash;
@@ -124,14 +124,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		kmemcheck_annotate_bitfield(tw, flags);
 
 		/* Give us an identity. */
-		tw->tw_daddr	    = inet->daddr;
-		tw->tw_rcv_saddr    = inet->rcv_saddr;
+		tw->tw_daddr	    = inet->inet_daddr;
+		tw->tw_rcv_saddr    = inet->inet_rcv_saddr;
 		tw->tw_bound_dev_if = sk->sk_bound_dev_if;
-		tw->tw_num	    = inet->num;
+		tw->tw_num	    = inet->inet_num;
 		tw->tw_state	    = TCP_TIME_WAIT;
 		tw->tw_substate	    = state;
-		tw->tw_sport	    = inet->sport;
-		tw->tw_dport	    = inet->dport;
+		tw->tw_sport	    = inet->inet_sport;
+		tw->tw_dport	    = inet->inet_dport;
 		tw->tw_family	    = sk->sk_family;
 		tw->tw_reuse	    = sk->sk_reuse;
 		tw->tw_hash	    = sk->sk_hash;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 6c98b43badf4..fdf51badc8e5 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -161,7 +161,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
 		/* If socket is bound to an interface, only report
 		 * the packet if it came  from that interface.
 		 */
-		if (sk && inet_sk(sk)->num == protocol &&
+		if (sk && inet_sk(sk)->inet_num == protocol &&
 		    (!sk->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == dev->ifindex) &&
 		    sock_net(sk) == dev_net(dev)) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f9895180f481..322b40864ac0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -329,7 +329,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 		__be32 daddr;
 
 		/* Use correct destination address if we have options. */
-		daddr = inet->daddr;
+		daddr = inet->inet_daddr;
 		if(opt && opt->srr)
 			daddr = opt->faddr;
 
@@ -338,13 +338,13 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 					    .mark = sk->sk_mark,
 					    .nl_u = { .ip4_u =
 						      { .daddr = daddr,
-							.saddr = inet->saddr,
+							.saddr = inet->inet_saddr,
 							.tos = RT_CONN_FLAGS(sk) } },
 					    .proto = sk->sk_protocol,
 					    .flags = inet_sk_flowi_flags(sk),
 					    .uli_u = { .ports =
-						       { .sport = inet->sport,
-							 .dport = inet->dport } } };
+						       { .sport = inet->inet_sport,
+							 .dport = inet->inet_dport } } };
 
 			/* If this fails, retransmit mechanism of transport layer will
 			 * keep trying until route appears or the connection times
@@ -379,7 +379,7 @@ packet_routed:
 
 	if (opt && opt->optlen) {
 		iph->ihl += opt->optlen >> 2;
-		ip_options_build(skb, opt, inet->daddr, rt, 0);
+		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
 	}
 
 	ip_select_ident_more(iph, &rt->u.dst, sk,
@@ -846,7 +846,8 @@ int ip_append_data(struct sock *sk,
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
 	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
+			       mtu-exthdrlen);
 		return -EMSGSIZE;
 	}
 
@@ -1100,7 +1101,7 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
 	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
 		return -EMSGSIZE;
 	}
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 0c0b6e363a20..c602d985fe14 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -245,7 +245,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 {
 	struct ip_ra_chain *ra, *new_ra, **rap;
 
-	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW)
+	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
 		return -EINVAL;
 
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
@@ -492,7 +492,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			if (sk->sk_family == PF_INET ||
 			    (!((1 << sk->sk_state) &
 			       (TCPF_LISTEN | TCPF_CLOSE)) &&
-			     inet->daddr != LOOPBACK4_IPV6)) {
+			     inet->inet_daddr != LOOPBACK4_IPV6)) {
 #endif
 				if (inet->opt)
 					icsk->icsk_ext_hdr_len -= inet->opt->optlen;
@@ -1181,8 +1181,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
 			struct in_pktinfo info;
 
-			info.ipi_addr.s_addr = inet->rcv_saddr;
-			info.ipi_spec_dst.s_addr = inet->rcv_saddr;
+			info.ipi_addr.s_addr = inet->inet_rcv_saddr;
+			info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
 			info.ipi_ifindex = inet->mc_index;
 			put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
 		}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c757f0b4b74c..694974502ea6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -956,7 +956,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 	switch (optname) {
 	case MRT_INIT:
 		if (sk->sk_type != SOCK_RAW ||
-		    inet_sk(sk)->num != IPPROTO_IGMP)
+		    inet_sk(sk)->inet_num != IPPROTO_IGMP)
 			return -EOPNOTSUPP;
 		if (optlen != sizeof(int))
 			return -ENOPROTOOPT;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index aa95bb82ee6c..9cd423ffafa8 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -255,10 +255,10 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	struct nf_conntrack_tuple tuple;
 
 	memset(&tuple, 0, sizeof(tuple));
-	tuple.src.u3.ip = inet->rcv_saddr;
-	tuple.src.u.tcp.port = inet->sport;
-	tuple.dst.u3.ip = inet->daddr;
-	tuple.dst.u.tcp.port = inet->dport;
+	tuple.src.u3.ip = inet->inet_rcv_saddr;
+	tuple.src.u.tcp.port = inet->inet_sport;
+	tuple.dst.u3.ip = inet->inet_daddr;
+	tuple.dst.u.tcp.port = inet->inet_dport;
 	tuple.src.l3num = PF_INET;
 	tuple.dst.protonum = sk->sk_protocol;
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 39e2a6b8752c..9ef8c0829a77 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -87,7 +87,7 @@ void raw_hash_sk(struct sock *sk)
 	struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
 	struct hlist_head *head;
 
-	head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)];
+	head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
 
 	write_lock_bh(&h->lock);
 	sk_add_node(sk, head);
@@ -115,9 +115,9 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
 	sk_for_each_from(sk, node) {
 		struct inet_sock *inet = inet_sk(sk);
 
-		if (net_eq(sock_net(sk), net) && inet->num == num	&&
-		    !(inet->daddr && inet->daddr != raddr) 		&&
-		    !(inet->rcv_saddr && inet->rcv_saddr != laddr)	&&
+		if (net_eq(sock_net(sk), net) && inet->inet_num == num	&&
+		    !(inet->inet_daddr && inet->inet_daddr != raddr) 	&&
+		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
 		    !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
 			goto found; /* gotcha */
 	}
@@ -326,7 +326,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	int err;
 
 	if (length > rt->u.dst.dev->mtu) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
 			       rt->u.dst.dev->mtu);
 		return -EMSGSIZE;
 	}
@@ -489,10 +489,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		err = -EDESTADDRREQ;
 		if (sk->sk_state != TCP_ESTABLISHED)
 			goto out;
-		daddr = inet->daddr;
+		daddr = inet->inet_daddr;
 	}
 
-	ipc.addr = inet->saddr;
+	ipc.addr = inet->inet_saddr;
 	ipc.opt = NULL;
 	ipc.shtx.flags = 0;
 	ipc.oif = sk->sk_bound_dev_if;
@@ -634,9 +634,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
 	    chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
 		goto out;
-	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
 	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
-		inet->saddr = 0;  /* Use device */
+		inet->inet_saddr = 0;  /* Use device */
 	sk_dst_reset(sk);
 	ret = 0;
 out:	return ret;
@@ -706,7 +706,7 @@ static int raw_init(struct sock *sk)
 {
 	struct raw_sock *rp = raw_sk(sk);
 
-	if (inet_sk(sk)->num == IPPROTO_ICMP)
+	if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
 		memset(&rp->filter, 0, sizeof(rp->filter));
 	return 0;
 }
@@ -743,7 +743,7 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
 	if (optname == ICMP_FILTER) {
-		if (inet_sk(sk)->num != IPPROTO_ICMP)
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
 			return -EOPNOTSUPP;
 		else
 			return raw_seticmpfilter(sk, optval, optlen);
@@ -773,7 +773,7 @@ static int do_raw_getsockopt(struct sock *sk, int level, int optname,
 			  char __user *optval, int __user *optlen)
 {
 	if (optname == ICMP_FILTER) {
-		if (inet_sk(sk)->num != IPPROTO_ICMP)
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
 			return -EOPNOTSUPP;
 		else
 			return raw_geticmpfilter(sk, optval, optlen);
@@ -932,10 +932,10 @@ EXPORT_SYMBOL_GPL(raw_seq_stop);
 static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 {
 	struct inet_sock *inet = inet_sk(sp);
-	__be32 dest = inet->daddr,
-	       src = inet->rcv_saddr;
+	__be32 dest = inet->inet_daddr,
+	       src = inet->inet_rcv_saddr;
 	__u16 destp = 0,
-	      srcp  = inet->num;
+	      srcp  = inet->inet_num;
 
 	seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index cf13726259cd..206a291dff03 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1998,7 +1998,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	__skb_queue_purge(&sk->sk_async_wait_queue);
 #endif
 
-	inet->dport = 0;
+	inet->inet_dport = 0;
 
 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 		inet_reset_saddr(sk);
@@ -2022,7 +2022,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
 	__sk_dst_reset(sk);
 
-	WARN_ON(inet->num && !icsk->icsk_bind_hash);
+	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
 	sk->sk_error_report(sk);
 	return err;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 99718703d040..a4a3390a5287 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -165,10 +165,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		nexthop = inet->opt->faddr;
 	}
 
-	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
+	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
 			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 			       IPPROTO_TCP,
-			       inet->sport, usin->sin_port, sk, 1);
+			       inet->inet_sport, usin->sin_port, sk, 1);
 	if (tmp < 0) {
 		if (tmp == -ENETUNREACH)
 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -183,11 +183,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (!inet->opt || !inet->opt->srr)
 		daddr = rt->rt_dst;
 
-	if (!inet->saddr)
-		inet->saddr = rt->rt_src;
-	inet->rcv_saddr = inet->saddr;
+	if (!inet->inet_saddr)
+		inet->inet_saddr = rt->rt_src;
+	inet->inet_rcv_saddr = inet->inet_saddr;
 
-	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
+	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 		/* Reset inherited state */
 		tp->rx_opt.ts_recent	   = 0;
 		tp->rx_opt.ts_recent_stamp = 0;
@@ -210,8 +210,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		}
 	}
 
-	inet->dport = usin->sin_port;
-	inet->daddr = daddr;
+	inet->inet_dport = usin->sin_port;
+	inet->inet_daddr = daddr;
 
 	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet->opt)
@@ -230,7 +230,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		goto failure;
 
 	err = ip_route_newports(&rt, IPPROTO_TCP,
-				inet->sport, inet->dport, sk);
+				inet->inet_sport, inet->inet_dport, sk);
 	if (err)
 		goto failure;
 
@@ -239,12 +239,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk_setup_caps(sk, &rt->u.dst);
 
 	if (!tp->write_seq)
-		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
-							   inet->daddr,
-							   inet->sport,
+		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
+							   inet->inet_daddr,
+							   inet->inet_sport,
 							   usin->sin_port);
 
-	inet->id = tp->write_seq ^ jiffies;
+	inet->inet_id = tp->write_seq ^ jiffies;
 
 	err = tcp_connect(sk);
 	rt = NULL;
@@ -261,7 +261,7 @@ failure:
 	tcp_set_state(sk, TCP_CLOSE);
 	ip_rt_put(rt);
 	sk->sk_route_caps = 0;
-	inet->dport = 0;
+	inet->inet_dport = 0;
 	return err;
 }
 
@@ -520,12 +520,13 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 	struct tcphdr *th = tcp_hdr(skb);
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		th->check = ~tcp_v4_check(len, inet->saddr,
-					  inet->daddr, 0);
+		th->check = ~tcp_v4_check(len, inet->inet_saddr,
+					  inet->inet_daddr, 0);
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct tcphdr, check);
 	} else {
-		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
+		th->check = tcp_v4_check(len, inet->inet_saddr,
+					 inet->inet_daddr,
 					 csum_partial(th,
 						      th->doff << 2,
 						      skb->csum));
@@ -848,7 +849,7 @@ static struct tcp_md5sig_key *
 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 					 struct sock *addr_sk)
 {
-	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
+	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 }
 
 EXPORT_SYMBOL(tcp_v4_md5_lookup);
@@ -923,7 +924,7 @@ EXPORT_SYMBOL(tcp_v4_md5_do_add);
 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 			       u8 *newkey, u8 newkeylen)
 {
-	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
+	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 				 newkey, newkeylen);
 }
 
@@ -1089,8 +1090,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
 	__be32 saddr, daddr;
 
 	if (sk) {
-		saddr = inet_sk(sk)->saddr;
-		daddr = inet_sk(sk)->daddr;
+		saddr = inet_sk(sk)->inet_saddr;
+		daddr = inet_sk(sk)->inet_daddr;
 	} else if (req) {
 		saddr = inet_rsk(req)->loc_addr;
 		daddr = inet_rsk(req)->rmt_addr;
@@ -1380,9 +1381,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newtp		      = tcp_sk(newsk);
 	newinet		      = inet_sk(newsk);
 	ireq		      = inet_rsk(req);
-	newinet->daddr	      = ireq->rmt_addr;
-	newinet->rcv_saddr    = ireq->loc_addr;
-	newinet->saddr	      = ireq->loc_addr;
+	newinet->inet_daddr   = ireq->rmt_addr;
+	newinet->inet_rcv_saddr = ireq->loc_addr;
+	newinet->inet_saddr	      = ireq->loc_addr;
 	newinet->opt	      = ireq->opt;
 	ireq->opt	      = NULL;
 	newinet->mc_index     = inet_iif(skb);
@@ -1390,7 +1391,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (newinet->opt)
 		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
-	newinet->id = newtp->write_seq ^ jiffies;
+	newinet->inet_id = newtp->write_seq ^ jiffies;
 
 	tcp_mtup_init(newsk);
 	tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1403,7 +1404,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Copy over the MD5 key from the original socket */
-	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
+	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
+	if (key != NULL) {
 		/*
 		 * We're using one, so create a matching key
 		 * on the newsk structure. If we fail to get
@@ -1412,7 +1414,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		 */
 		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
 		if (newkey != NULL)
-			tcp_v4_md5_do_add(newsk, newinet->daddr,
+			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
 					  newkey, key->keylen);
 		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 	}
@@ -1711,8 +1713,8 @@ int tcp_v4_remember_stamp(struct sock *sk)
 	struct inet_peer *peer = NULL;
 	int release_it = 0;
 
-	if (!rt || rt->rt_dst != inet->daddr) {
-		peer = inet_getpeer(inet->daddr, 1);
+	if (!rt || rt->rt_dst != inet->inet_daddr) {
+		peer = inet_getpeer(inet->inet_daddr, 1);
 		release_it = 1;
 	} else {
 		if (!rt->peer)
@@ -2225,7 +2227,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req,
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
 		i,
 		ireq->loc_addr,
-		ntohs(inet_sk(sk)->sport),
+		ntohs(inet_sk(sk)->inet_sport),
 		ireq->rmt_addr,
 		ntohs(ireq->rmt_port),
 		TCP_SYN_RECV,
@@ -2248,10 +2250,10 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet = inet_sk(sk);
-	__be32 dest = inet->daddr;
-	__be32 src = inet->rcv_saddr;
-	__u16 destp = ntohs(inet->dport);
-	__u16 srcp = ntohs(inet->sport);
+	__be32 dest = inet->inet_daddr;
+	__be32 src = inet->inet_rcv_saddr;
+	__u16 destp = ntohs(inet->inet_dport);
+	__u16 srcp = ntohs(inet->inet_sport);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
 		timer_active	= 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a7080e..2e2eb74ac4cc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -661,8 +661,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
-	th->source		= inet->sport;
-	th->dest		= inet->dport;
+	th->source		= inet->inet_sport;
+	th->dest		= inet->inet_dport;
 	th->seq			= htonl(tcb->seq);
 	th->ack_seq		= htonl(tp->rcv_nxt);
 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 59f5b5e7c566..7a3cc2ffad84 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -94,7 +94,8 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	const struct inet_sock *inet = inet_sk(sk);
 
 	/* Only update if port matches */
-	if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port)
+	if ((port == 0 || ntohs(inet->inet_dport) == port ||
+	     ntohs(inet->inet_sport) == port)
 	    && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
 
 		spin_lock(&tcp_probe.lock);
@@ -103,10 +104,10 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			struct tcp_log *p = tcp_probe.log + tcp_probe.head;
 
 			p->tstamp = ktime_get();
-			p->saddr = inet->saddr;
-			p->sport = inet->sport;
-			p->daddr = inet->daddr;
-			p->dport = inet->dport;
+			p->saddr = inet->inet_saddr;
+			p->sport = inet->inet_sport;
+			p->daddr = inet->inet_daddr;
+			p->dport = inet->inet_dport;
 			p->length = skb->len;
 			p->snd_nxt = tp->snd_nxt;
 			p->snd_una = tp->snd_una;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index cdb2ca7684d4..6e8996cb79d0 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -303,15 +303,15 @@ void tcp_retransmit_timer(struct sock *sk)
 		struct inet_sock *inet = inet_sk(sk);
 		if (sk->sk_family == AF_INET) {
 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
-			       &inet->daddr, ntohs(inet->dport),
-			       inet->num, tp->snd_una, tp->snd_nxt);
+			       &inet->inet_daddr, ntohs(inet->inet_dport),
+			       inet->inet_num, tp->snd_una, tp->snd_nxt);
 		}
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		else if (sk->sk_family == AF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
-			       &np->daddr, ntohs(inet->dport),
-			       inet->num, tp->snd_una, tp->snd_nxt);
+			       &np->daddr, ntohs(inet->inet_dport),
+			       inet->inet_num, tp->snd_una, tp->snd_nxt);
 		}
 #endif
 #endif
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 45a8a7e374d8..ec5c7a720c0c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -214,7 +214,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 			goto fail_unlock;
 	}
 found:
-	inet_sk(sk)->num = snum;
+	inet_sk(sk)->inet_num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
 		sk_nulls_add_node_rcu(sk, &hslot->head);
@@ -233,8 +233,8 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
 	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
 
 	return 	(!ipv6_only_sock(sk2)  &&
-		 (!inet1->rcv_saddr || !inet2->rcv_saddr ||
-		   inet1->rcv_saddr == inet2->rcv_saddr));
+		 (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
+		   inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
 }
 
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
@@ -253,18 +253,18 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
 		struct inet_sock *inet = inet_sk(sk);
 
 		score = (sk->sk_family == PF_INET ? 1 : 0);
-		if (inet->rcv_saddr) {
-			if (inet->rcv_saddr != daddr)
+		if (inet->inet_rcv_saddr) {
+			if (inet->inet_rcv_saddr != daddr)
 				return -1;
 			score += 2;
 		}
-		if (inet->daddr) {
-			if (inet->daddr != saddr)
+		if (inet->inet_daddr) {
+			if (inet->inet_daddr != saddr)
 				return -1;
 			score += 2;
 		}
-		if (inet->dport) {
-			if (inet->dport != sport)
+		if (inet->inet_dport) {
+			if (inet->inet_dport != sport)
 				return -1;
 			score += 2;
 		}
@@ -360,9 +360,10 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 
 		if (!net_eq(sock_net(s), net)				||
 		    s->sk_hash != hnum					||
-		    (inet->daddr && inet->daddr != rmt_addr)		||
-		    (inet->dport != rmt_port && inet->dport)		||
-		    (inet->rcv_saddr && inet->rcv_saddr != loc_addr)	||
+		    (inet->inet_daddr && inet->inet_daddr != rmt_addr)	||
+		    (inet->inet_dport != rmt_port && inet->inet_dport)	||
+		    (inet->inet_rcv_saddr	&&
+		     inet->inet_rcv_saddr != loc_addr)			||
 		    ipv6_only_sock(s)					||
 		    (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
 			continue;
@@ -646,14 +647,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	} else {
 		if (sk->sk_state != TCP_ESTABLISHED)
 			return -EDESTADDRREQ;
-		daddr = inet->daddr;
-		dport = inet->dport;
+		daddr = inet->inet_daddr;
+		dport = inet->inet_dport;
 		/* Open fast path for connected socket.
 		   Route will not be used, if at least one option is set.
 		 */
 		connected = 1;
 	}
-	ipc.addr = inet->saddr;
+	ipc.addr = inet->inet_saddr;
 
 	ipc.oif = sk->sk_bound_dev_if;
 	err = sock_tx_timestamp(msg, sk, &ipc.shtx);
@@ -708,7 +709,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				    .proto = sk->sk_protocol,
 				    .flags = inet_sk_flowi_flags(sk),
 				    .uli_u = { .ports =
-					       { .sport = inet->sport,
+					       { .sport = inet->inet_sport,
 						 .dport = dport } } };
 		struct net *net = sock_net(sk);
 
@@ -752,7 +753,7 @@ back_from_confirm:
 	inet->cork.fl.fl4_dst = daddr;
 	inet->cork.fl.fl_ip_dport = dport;
 	inet->cork.fl.fl4_src = saddr;
-	inet->cork.fl.fl_ip_sport = inet->sport;
+	inet->cork.fl.fl_ip_sport = inet->inet_sport;
 	up->pending = AF_INET;
 
 do_append_data:
@@ -1029,15 +1030,15 @@ int udp_disconnect(struct sock *sk, int flags)
 	 */
 
 	sk->sk_state = TCP_CLOSE;
-	inet->daddr = 0;
-	inet->dport = 0;
+	inet->inet_daddr = 0;
+	inet->inet_dport = 0;
 	sk->sk_bound_dev_if = 0;
 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
 		inet_reset_saddr(sk);
 
 	if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
 		sk->sk_prot->unhash(sk);
-		inet->sport = 0;
+		inet->inet_sport = 0;
 	}
 	sk_dst_reset(sk);
 	return 0;
@@ -1053,7 +1054,7 @@ void udp_lib_unhash(struct sock *sk)
 
 		spin_lock_bh(&hslot->lock);
 		if (sk_nulls_del_node_init_rcu(sk)) {
-			inet_sk(sk)->num = 0;
+			inet_sk(sk)->inet_num = 0;
 			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 		}
 		spin_unlock_bh(&hslot->lock);
@@ -1752,10 +1753,10 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 		int bucket, int *len)
 {
 	struct inet_sock *inet = inet_sk(sp);
-	__be32 dest = inet->daddr;
-	__be32 src  = inet->rcv_saddr;
-	__u16 destp	  = ntohs(inet->dport);
-	__u16 srcp	  = ntohs(inet->sport);
+	__be32 dest = inet->inet_daddr;
+	__be32 src  = inet->inet_rcv_saddr;
+	__u16 destp	  = ntohs(inet->inet_dport);
+	__u16 srcp	  = ntohs(inet->inet_sport);
 
 	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 94216519873c..b6d058818673 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -185,7 +185,7 @@ lookup_protocol:
 	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
 	if (SOCK_RAW == sock->type) {
-		inet->num = protocol;
+		inet->inet_num = protocol;
 		if (IPPROTO_RAW == protocol)
 			inet->hdrincl = 1;
 	}
@@ -228,12 +228,12 @@ lookup_protocol:
 	 */
 	sk_refcnt_debug_inc(sk);
 
-	if (inet->num) {
+	if (inet->inet_num) {
 		/* It assumes that any protocol which allows
 		 * the user to assign a number at socket
 		 * creation time automatically shares.
 		 */
-		inet->sport = htons(inet->num);
+		inet->inet_sport = htons(inet->inet_num);
 		sk->sk_prot->hash(sk);
 	}
 	if (sk->sk_prot->init) {
@@ -281,7 +281,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	lock_sock(sk);
 
 	/* Check these errors (active socket, double bind). */
-	if (sk->sk_state != TCP_CLOSE || inet->num) {
+	if (sk->sk_state != TCP_CLOSE || inet->inet_num) {
 		err = -EINVAL;
 		goto out;
 	}
@@ -353,8 +353,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		}
 	}
 
-	inet->rcv_saddr = v4addr;
-	inet->saddr = v4addr;
+	inet->inet_rcv_saddr = v4addr;
+	inet->inet_saddr = v4addr;
 
 	ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr);
 
@@ -375,9 +375,9 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	}
 	if (snum)
 		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
-	inet->sport = htons(inet->num);
-	inet->dport = 0;
-	inet->daddr = 0;
+	inet->inet_sport = htons(inet->inet_num);
+	inet->inet_dport = 0;
+	inet->inet_daddr = 0;
 out:
 	release_sock(sk);
 	return err;
@@ -441,12 +441,12 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 	sin->sin6_flowinfo = 0;
 	sin->sin6_scope_id = 0;
 	if (peer) {
-		if (!inet->dport)
+		if (!inet->inet_dport)
 			return -ENOTCONN;
 		if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
 		    peer == 1)
 			return -ENOTCONN;
-		sin->sin6_port = inet->dport;
+		sin->sin6_port = inet->inet_dport;
 		ipv6_addr_copy(&sin->sin6_addr, &np->daddr);
 		if (np->sndflow)
 			sin->sin6_flowinfo = np->flow_label;
@@ -456,7 +456,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		else
 			ipv6_addr_copy(&sin->sin6_addr, &np->rcv_saddr);
 
-		sin->sin6_port = inet->sport;
+		sin->sin6_port = inet->inet_sport;
 	}
 	if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
 		sin->sin6_scope_id = sk->sk_bound_dev_if;
@@ -655,8 +655,8 @@ int inet6_sk_rebuild_header(struct sock *sk)
 		fl.fl6_flowlabel = np->flow_label;
 		fl.oif = sk->sk_bound_dev_if;
 		fl.mark = sk->sk_mark;
-		fl.fl_ip_dport = inet->dport;
-		fl.fl_ip_sport = inet->sport;
+		fl.fl_ip_dport = inet->inet_dport;
+		fl.fl_ip_sport = inet->inet_sport;
 		security_sk_classify_flow(sk, &fl);
 
 		if (np->opt && np->opt->srcrt) {
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index dbfec7147aa5..9f70452a69e7 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -98,13 +98,14 @@ ipv4_connected:
 		if (err)
 			goto out;
 
-		ipv6_addr_set_v4mapped(inet->daddr, &np->daddr);
+		ipv6_addr_set_v4mapped(inet->inet_daddr, &np->daddr);
 
 		if (ipv6_addr_any(&np->saddr))
-			ipv6_addr_set_v4mapped(inet->saddr, &np->saddr);
+			ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
 
 		if (ipv6_addr_any(&np->rcv_saddr))
-			ipv6_addr_set_v4mapped(inet->rcv_saddr, &np->rcv_saddr);
+			ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
+					       &np->rcv_saddr);
 
 		goto out;
 	}
@@ -133,7 +134,7 @@ ipv4_connected:
 	ipv6_addr_copy(&np->daddr, daddr);
 	np->flow_label = fl.fl6_flowlabel;
 
-	inet->dport = usin->sin6_port;
+	inet->inet_dport = usin->sin6_port;
 
 	/*
 	 *	Check for a route to destination an obtain the
@@ -145,8 +146,8 @@ ipv4_connected:
 	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 	fl.oif = sk->sk_bound_dev_if;
 	fl.mark = sk->sk_mark;
-	fl.fl_ip_dport = inet->dport;
-	fl.fl_ip_sport = inet->sport;
+	fl.fl_ip_dport = inet->inet_dport;
+	fl.fl_ip_sport = inet->inet_sport;
 
 	if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST))
 		fl.oif = np->mcast_oif;
@@ -188,7 +189,7 @@ ipv4_connected:
 
 	if (ipv6_addr_any(&np->rcv_saddr)) {
 		ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src);
-		inet->rcv_saddr = LOOPBACK4_IPV6;
+		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
 	}
 
 	ip6_dst_store(sk, dst,
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index a9f4a21b31ea..19dceef4fcca 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -132,7 +132,7 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
 
 	sin6->sin6_family = AF_INET6;
 	ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
-	sin6->sin6_port	= inet_sk(sk)->dport;
+	sin6->sin6_port	= inet_sk(sk)->inet_dport;
 	/* We do not store received flowlabel for TCP */
 	sin6->sin6_flowinfo = 0;
 	sin6->sin6_scope_id = 0;
@@ -195,8 +195,8 @@ int inet6_csk_xmit(struct sk_buff *skb, int ipfragok)
 	IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
 	fl.oif = sk->sk_bound_dev_if;
 	fl.mark = sk->sk_mark;
-	fl.fl_ip_sport = inet->sport;
-	fl.fl_ip_dport = inet->dport;
+	fl.fl_ip_sport = inet->inet_sport;
+	fl.fl_ip_dport = inet->inet_dport;
 	security_sk_classify_flow(sk, &fl);
 
 	if (np->opt && np->opt->srcrt) {
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 874aed86e1a2..00c6a3e6cddf 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -125,7 +125,7 @@ static int inline compute_score(struct sock *sk, struct net *net,
 {
 	int score = -1;
 
-	if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
+	if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
 	    sk->sk_family == PF_INET6) {
 		const struct ipv6_pinfo *np = inet6_sk(sk);
 
@@ -214,10 +214,10 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	const struct in6_addr *daddr = &np->rcv_saddr;
 	const struct in6_addr *saddr = &np->daddr;
 	const int dif = sk->sk_bound_dev_if;
-	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
 	struct net *net = sock_net(sk);
 	const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr,
-						inet->dport);
+						inet->inet_dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
 	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
@@ -248,8 +248,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 unique:
 	/* Must record num and sport now. Otherwise we will see
 	 * in hash table socket with a funny identity. */
-	inet->num = lport;
-	inet->sport = htons(lport);
+	inet->inet_num = lport;
+	inet->inet_sport = htons(lport);
 	WARN_ON(!sk_unhashed(sk));
 	__sk_nulls_add_node_rcu(sk, &head->chain);
 	sk->sk_hash = hash;
@@ -279,7 +279,7 @@ static inline u32 inet6_sk_port_offset(const struct sock *sk)
 	const struct ipv6_pinfo *np = inet6_sk(sk);
 	return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32,
 					  np->daddr.s6_addr32,
-					  inet->dport);
+					  inet->inet_dport);
 }
 
 int inet6_hash_connect(struct inet_timewait_death_row *death_row,
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 716153941fc4..85849b4f5a36 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1297,7 +1297,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 	switch (optname) {
 	case MRT6_INIT:
 		if (sk->sk_type != SOCK_RAW ||
-		    inet_sk(sk)->num != IPPROTO_ICMPV6)
+		    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
 			return -EOPNOTSUPP;
 		if (optlen < sizeof(int))
 			return -EINVAL;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index dc0f7366073d..39e10ac88019 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -64,7 +64,7 @@ int ip6_ra_control(struct sock *sk, int sel)
 	struct ip6_ra_chain *ra, *new_ra, **rap;
 
 	/* RA packet may be delivered ONLY to IPPROTO_RAW socket */
-	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num != IPPROTO_RAW)
+	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW)
 		return -ENOPROTOOPT;
 
 	new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
@@ -106,7 +106,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 	if (inet_sk(sk)->is_icsk) {
 		if (opt &&
 		    !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
-		    inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
+		    inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) {
 			struct inet_connection_sock *icsk = inet_csk(sk);
 			icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
 			icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -234,7 +234,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 
 	case IPV6_V6ONLY:
 		if (optlen < sizeof(int) ||
-		    inet_sk(sk)->num)
+		    inet_sk(sk)->inet_num)
 			goto e_inval;
 		np->ipv6only = valbool;
 		retv = 0;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index fd737efed96c..52ed7d7f9dab 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -72,7 +72,7 @@ static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
 	int is_multicast = ipv6_addr_is_multicast(loc_addr);
 
 	sk_for_each_from(sk, node)
-		if (inet_sk(sk)->num == num) {
+		if (inet_sk(sk)->inet_num == num) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
 
 			if (!net_eq(sock_net(sk), net))
@@ -298,7 +298,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 			dev_put(dev);
 	}
 
-	inet->rcv_saddr = inet->saddr = v4addr;
+	inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
 	ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr);
 	if (!(addr_type & IPV6_ADDR_MULTICAST))
 		ipv6_addr_copy(&np->saddr, &addr->sin6_addr);
@@ -415,14 +415,14 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
 				   skb_network_header_len(skb));
 		if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
 				     &ipv6_hdr(skb)->daddr,
-				     skb->len, inet->num, skb->csum))
+				     skb->len, inet->inet_num, skb->csum))
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 	if (!skb_csum_unnecessary(skb))
 		skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
 							 &ipv6_hdr(skb)->daddr,
 							 skb->len,
-							 inet->num, 0));
+							 inet->inet_num, 0));
 
 	if (inet->hdrincl) {
 		if (skb_checksum_complete(skb)) {
@@ -765,8 +765,8 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		proto = ntohs(sin6->sin6_port);
 
 		if (!proto)
-			proto = inet->num;
-		else if (proto != inet->num)
+			proto = inet->inet_num;
+		else if (proto != inet->inet_num)
 			return(-EINVAL);
 
 		if (proto > 255)
@@ -799,7 +799,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		if (sk->sk_state != TCP_ESTABLISHED)
 			return -EDESTADDRREQ;
 
-		proto = inet->num;
+		proto = inet->inet_num;
 		daddr = &np->daddr;
 		fl.fl6_flowlabel = np->flow_label;
 	}
@@ -966,7 +966,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
 
 	switch (optname) {
 		case IPV6_CHECKSUM:
-			if (inet_sk(sk)->num == IPPROTO_ICMPV6 &&
+			if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 &&
 			    level == IPPROTO_IPV6) {
 				/*
 				 * RFC3542 tells that IPV6_CHECKSUM socket
@@ -1006,7 +1006,7 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
 			break;
 
 		case SOL_ICMPV6:
-			if (inet_sk(sk)->num != IPPROTO_ICMPV6)
+			if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
 				return -EOPNOTSUPP;
 			return rawv6_seticmpfilter(sk, level, optname, optval,
 						   optlen);
@@ -1029,7 +1029,7 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
 	case SOL_RAW:
 		break;
 	case SOL_ICMPV6:
-		if (inet_sk(sk)->num != IPPROTO_ICMPV6)
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
 			return -EOPNOTSUPP;
 		return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
@@ -1086,7 +1086,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
 			break;
 
 		case SOL_ICMPV6:
-			if (inet_sk(sk)->num != IPPROTO_ICMPV6)
+			if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
 				return -EOPNOTSUPP;
 			return rawv6_geticmpfilter(sk, level, optname, optval,
 						   optlen);
@@ -1109,7 +1109,7 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
 	case SOL_RAW:
 		break;
 	case SOL_ICMPV6:
-		if (inet_sk(sk)->num != IPPROTO_ICMPV6)
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
 			return -EOPNOTSUPP;
 		return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
@@ -1156,7 +1156,7 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
 
 static void rawv6_close(struct sock *sk, long timeout)
 {
-	if (inet_sk(sk)->num == IPPROTO_RAW)
+	if (inet_sk(sk)->inet_num == IPPROTO_RAW)
 		ip6_ra_control(sk, -1);
 	ip6mr_sk_done(sk);
 	sk_common_release(sk);
@@ -1175,7 +1175,7 @@ static int rawv6_init_sk(struct sock *sk)
 {
 	struct raw6_sock *rp = raw6_sk(sk);
 
-	switch (inet_sk(sk)->num) {
+	switch (inet_sk(sk)->inet_num) {
 	case IPPROTO_ICMPV6:
 		rp->checksum = 1;
 		rp->offset   = 2;
@@ -1225,7 +1225,7 @@ static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 	dest  = &np->daddr;
 	src   = &np->rcv_saddr;
 	destp = 0;
-	srcp  = inet_sk(sp)->num;
+	srcp  = inet_sk(sp)->inet_num;
 	seq_printf(seq,
 		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
 		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index cbe55e5d9f96..c46da533888a 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -254,7 +254,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		fl.oif = sk->sk_bound_dev_if;
 		fl.mark = sk->sk_mark;
 		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
-		fl.fl_ip_sport = inet_sk(sk)->sport;
+		fl.fl_ip_sport = inet_sk(sk)->inet_sport;
 		security_req_classify_flow(req, &fl);
 		if (ip6_dst_lookup(sk, &dst, &fl))
 			goto out_free;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 451763059142..c54ec3615ded 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -226,8 +226,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 #endif
 			goto failure;
 		} else {
-			ipv6_addr_set_v4mapped(inet->saddr, &np->saddr);
-			ipv6_addr_set_v4mapped(inet->rcv_saddr, &np->rcv_saddr);
+			ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
+			ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
+					       &np->rcv_saddr);
 		}
 
 		return err;
@@ -243,7 +244,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl.oif = sk->sk_bound_dev_if;
 	fl.mark = sk->sk_mark;
 	fl.fl_ip_dport = usin->sin6_port;
-	fl.fl_ip_sport = inet->sport;
+	fl.fl_ip_sport = inet->inet_sport;
 
 	if (np->opt && np->opt->srcrt) {
 		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
@@ -275,7 +276,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	/* set the source address */
 	ipv6_addr_copy(&np->saddr, saddr);
-	inet->rcv_saddr = LOOPBACK4_IPV6;
+	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
 	sk->sk_gso_type = SKB_GSO_TCPV6;
 	__ip6_dst_store(sk, dst, NULL, NULL);
@@ -287,7 +288,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 
-	inet->dport = usin->sin6_port;
+	inet->inet_dport = usin->sin6_port;
 
 	tcp_set_state(sk, TCP_SYN_SENT);
 	err = inet6_hash_connect(&tcp_death_row, sk);
@@ -297,8 +298,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (!tp->write_seq)
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
 							     np->daddr.s6_addr32,
-							     inet->sport,
-							     inet->dport);
+							     inet->inet_sport,
+							     inet->inet_dport);
 
 	err = tcp_connect(sk);
 	if (err)
@@ -310,7 +311,7 @@ late_failure:
 	tcp_set_state(sk, TCP_CLOSE);
 	__sk_dst_reset(sk);
 failure:
-	inet->dport = 0;
+	inet->inet_dport = 0;
 	sk->sk_route_caps = 0;
 	return err;
 }
@@ -383,8 +384,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 			fl.oif = sk->sk_bound_dev_if;
 			fl.mark = sk->sk_mark;
-			fl.fl_ip_dport = inet->dport;
-			fl.fl_ip_sport = inet->sport;
+			fl.fl_ip_dport = inet->inet_dport;
+			fl.fl_ip_sport = inet->inet_sport;
 			security_skb_classify_flow(skb, &fl);
 
 			if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
@@ -1291,9 +1292,9 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-		ipv6_addr_set_v4mapped(newinet->daddr, &newnp->daddr);
+		ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr);
 
-		ipv6_addr_set_v4mapped(newinet->saddr, &newnp->saddr);
+		ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr);
 
 		ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
 
@@ -1431,7 +1432,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);
 
-	newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
+	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
+	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Copy over the MD5 key from the original socket */
@@ -1931,8 +1933,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 
 	dest  = &np->daddr;
 	src   = &np->rcv_saddr;
-	destp = ntohs(inet->dport);
-	srcp  = ntohs(inet->sport);
+	destp = ntohs(inet->inet_dport);
+	srcp  = ntohs(inet->inet_sport);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
 		timer_active	= 1;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b86425b7ea22..829d300a6f35 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -53,7 +53,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 {
 	const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
 	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
-	__be32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
+	__be32 sk1_rcv_saddr = inet_sk(sk)->inet_rcv_saddr;
 	__be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
 	int sk_ipv6only = ipv6_only_sock(sk);
 	int sk2_ipv6only = inet_v6_ipv6only(sk2);
@@ -63,8 +63,8 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 	/* if both are mapped, treat as IPv4 */
 	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED)
 		return (!sk2_ipv6only &&
-			(!sk_rcv_saddr || !sk2_rcv_saddr ||
-			  sk_rcv_saddr == sk2_rcv_saddr));
+			(!sk1_rcv_saddr || !sk2_rcv_saddr ||
+			  sk1_rcv_saddr == sk2_rcv_saddr));
 
 	if (addr_type2 == IPV6_ADDR_ANY &&
 	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
@@ -100,8 +100,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
 		struct inet_sock *inet = inet_sk(sk);
 
 		score = 0;
-		if (inet->dport) {
-			if (inet->dport != sport)
+		if (inet->inet_dport) {
+			if (inet->inet_dport != sport)
 				return -1;
 			score++;
 		}
@@ -417,8 +417,8 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
 
 		if (s->sk_hash == num && s->sk_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(s);
-			if (inet->dport) {
-				if (inet->dport != rmt_port)
+			if (inet->inet_dport) {
+				if (inet->inet_dport != rmt_port)
 					continue;
 			}
 			if (!ipv6_addr_any(&np->daddr) &&
@@ -792,7 +792,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		if (ipv6_addr_v4mapped(daddr)) {
 			struct sockaddr_in sin;
 			sin.sin_family = AF_INET;
-			sin.sin_port = sin6 ? sin6->sin6_port : inet->dport;
+			sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport;
 			sin.sin_addr.s_addr = daddr->s6_addr32[3];
 			msg->msg_name = &sin;
 			msg->msg_namelen = sizeof(sin);
@@ -865,7 +865,7 @@ do_udp_sendmsg:
 		if (sk->sk_state != TCP_ESTABLISHED)
 			return -EDESTADDRREQ;
 
-		fl.fl_ip_dport = inet->dport;
+		fl.fl_ip_dport = inet->inet_dport;
 		daddr = &np->daddr;
 		fl.fl6_flowlabel = np->flow_label;
 		connected = 1;
@@ -911,7 +911,7 @@ do_udp_sendmsg:
 		fl.fl6_dst.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
 	if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr))
 		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
-	fl.fl_ip_sport = inet->sport;
+	fl.fl_ip_sport = inet->inet_sport;
 
 	/* merge ip6_build_xmit from ip6_output */
 	if (opt && opt->srcrt) {
@@ -1192,8 +1192,8 @@ static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket
 
 	dest  = &np->daddr;
 	src   = &np->rcv_saddr;
-	destp = ntohs(inet->dport);
-	srcp  = ntohs(inet->sport);
+	destp = ntohs(inet->inet_dport);
+	srcp  = ntohs(inet->inet_sport);
 	seq_printf(seq,
 		   "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
 		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index ebf00ad5b194..362afbd60a96 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -149,7 +149,7 @@ socket_match(const struct sk_buff *skb, const struct xt_match_param *par,
 
 		/* Ignore sockets listening on INADDR_ANY */
 		wildcard = (sk->sk_state != TCP_TIME_WAIT &&
-			    inet_sk(sk)->rcv_saddr == 0);
+			    inet_sk(sk)->inet_rcv_saddr == 0);
 
 		/* Ignore non-transparent sockets,
 		   if XT_SOCKET_TRANSPARENT is used */
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 24b743eb0b1b..45474a436862 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -67,11 +67,11 @@ static int rds_tcp_accept_one(struct socket *sock)
 	inet = inet_sk(new_sock->sk);
 
 	rdsdebug("accepted tcp %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",
-		  NIPQUAD(inet->saddr), ntohs(inet->sport),
-		  NIPQUAD(inet->daddr), ntohs(inet->dport));
+		  NIPQUAD(inet->inet_saddr), ntohs(inet->inet_sport),
+		  NIPQUAD(inet->inet_daddr), ntohs(inet->inet_dport));
 
-	conn = rds_conn_create(inet->saddr, inet->daddr, &rds_tcp_transport,
-			       GFP_KERNEL);
+	conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
+			       &rds_tcp_transport, GFP_KERNEL);
 	if (IS_ERR(conn)) {
 		ret = PTR_ERR(conn);
 		goto out;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 612dc878e05c..d9f4cc2c7869 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -296,19 +296,19 @@ static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk)
 {
 	addr->v4.sin_family = AF_INET;
 	addr->v4.sin_port = 0;
-	addr->v4.sin_addr.s_addr = inet_sk(sk)->rcv_saddr;
+	addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr;
 }
 
 /* Initialize sk->sk_rcv_saddr from sctp_addr. */
 static void sctp_v4_to_sk_saddr(union sctp_addr *addr, struct sock *sk)
 {
-	inet_sk(sk)->rcv_saddr = addr->v4.sin_addr.s_addr;
+	inet_sk(sk)->inet_rcv_saddr = addr->v4.sin_addr.s_addr;
 }
 
 /* Initialize sk->sk_daddr from sctp_addr. */
 static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk)
 {
-	inet_sk(sk)->daddr = addr->v4.sin_addr.s_addr;
+	inet_sk(sk)->inet_daddr = addr->v4.sin_addr.s_addr;
 }
 
 /* Initialize a sctp_addr from an address parameter. */
@@ -598,7 +598,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
 
 	newinet = inet_sk(newsk);
 
-	newinet->daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr;
+	newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr;
 
 	sk_refcnt_debug_inc(newsk);
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0970e92c6acd..4085db99033d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -394,7 +394,7 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
 
 	/* Refresh ephemeral port.  */
 	if (!bp->port)
-		bp->port = inet_sk(sk)->num;
+		bp->port = inet_sk(sk)->inet_num;
 
 	/* Add the address to the bind address list.
 	 * Use GFP_ATOMIC since BHs will be disabled.
@@ -403,7 +403,7 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
 
 	/* Copy back into socket for getsockname() use. */
 	if (!ret) {
-		inet_sk(sk)->sport = htons(inet_sk(sk)->num);
+		inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num);
 		af->to_sk_saddr(addr, sk);
 	}
 
@@ -1115,7 +1115,7 @@ static int __sctp_connect(struct sock* sk,
 	}
 
 	/* Initialize sk's dport and daddr for getpeername() */
-	inet_sk(sk)->dport = htons(asoc->peer.port);
+	inet_sk(sk)->inet_dport = htons(asoc->peer.port);
 	af = sctp_get_af_specific(sa_addr->sa.sa_family);
 	af->to_sk_daddr(sa_addr, sk);
 	sk->sk_err = 0;
@@ -5851,7 +5851,7 @@ pp_not_found:
 	 */
 success:
 	if (!sctp_sk(sk)->bind_hash) {
-		inet_sk(sk)->num = snum;
+		inet_sk(sk)->inet_num = snum;
 		sk_add_bind_node(sk, &pp->owner);
 		sctp_sk(sk)->bind_hash = pp;
 	}
@@ -5923,7 +5923,7 @@ SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog)
 		if (sctp_autobind(sk))
 			return -EAGAIN;
 	} else {
-		if (sctp_get_port(sk, inet_sk(sk)->num)) {
+		if (sctp_get_port(sk, inet_sk(sk)->inet_num)) {
 			sk->sk_state = SCTP_SS_CLOSED;
 			return -EADDRINUSE;
 		}
@@ -6094,14 +6094,14 @@ static void sctp_bucket_destroy(struct sctp_bind_bucket *pp)
 static inline void __sctp_put_port(struct sock *sk)
 {
 	struct sctp_bind_hashbucket *head =
-		&sctp_port_hashtable[sctp_phashfn(inet_sk(sk)->num)];
+		&sctp_port_hashtable[sctp_phashfn(inet_sk(sk)->inet_num)];
 	struct sctp_bind_bucket *pp;
 
 	sctp_spin_lock(&head->lock);
 	pp = sctp_sk(sk)->bind_hash;
 	__sk_del_bind_node(sk);
 	sctp_sk(sk)->bind_hash = NULL;
-	inet_sk(sk)->num = 0;
+	inet_sk(sk)->inet_num = 0;
 	sctp_bucket_destroy(pp);
 	sctp_spin_unlock(&head->lock);
 }
@@ -6128,7 +6128,7 @@ static int sctp_autobind(struct sock *sk)
 	/* Initialize a local sockaddr structure to INADDR_ANY. */
 	af = sctp_sk(sk)->pf->af;
 
-	port = htons(inet_sk(sk)->num);
+	port = htons(inet_sk(sk)->inet_num);
 	af->inaddr_any(&autoaddr, port);
 
 	return sctp_do_bind(sk, &autoaddr, af->sockaddr_len);
@@ -6697,12 +6697,12 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	/* Initialize sk's sport, dport, rcv_saddr and daddr for
 	 * getsockname() and getpeername()
 	 */
-	newinet->sport = inet->sport;
-	newinet->saddr = inet->saddr;
-	newinet->rcv_saddr = inet->rcv_saddr;
-	newinet->dport = htons(asoc->peer.port);
+	newinet->inet_sport = inet->inet_sport;
+	newinet->inet_saddr = inet->inet_saddr;
+	newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
+	newinet->inet_dport = htons(asoc->peer.port);
 	newinet->pmtudisc = inet->pmtudisc;
-	newinet->id = asoc->next_tsn ^ jiffies;
+	newinet->inet_id = asoc->next_tsn ^ jiffies;
 
 	newinet->uc_ttl = inet->uc_ttl;
 	newinet->mc_loop = 1;
@@ -6741,13 +6741,13 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	newsp->hmac = NULL;
 
 	/* Hook this new socket in to the bind_hash list. */
-	head = &sctp_port_hashtable[sctp_phashfn(inet_sk(oldsk)->num)];
+	head = &sctp_port_hashtable[sctp_phashfn(inet_sk(oldsk)->inet_num)];
 	sctp_local_bh_disable();
 	sctp_spin_lock(&head->lock);
 	pp = sctp_sk(oldsk)->bind_hash;
 	sk_add_bind_node(newsk, &pp->owner);
 	sctp_sk(newsk)->bind_hash = pp;
-	inet_sk(newsk)->num = inet_sk(oldsk)->num;
+	inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num;
 	sctp_spin_unlock(&head->lock);
 	sctp_local_bh_enable();
 
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index ccc5e83cae5d..c2a17876defd 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -272,14 +272,14 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
 	case PF_INET:
 		len = snprintf(buf, remaining, "ipv4 %s %pI4 %d\n",
 				proto_name,
-				&inet_sk(sk)->rcv_saddr,
-				inet_sk(sk)->num);
+				&inet_sk(sk)->inet_rcv_saddr,
+				inet_sk(sk)->inet_num);
 		break;
 	case PF_INET6:
 		len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n",
 				proto_name,
 				&inet6_sk(sk)->rcv_saddr,
-				inet_sk(sk)->num);
+				inet_sk(sk)->inet_num);
 		break;
 	default:
 		len = snprintf(buf, remaining, "*unknown-%d*\n",
@@ -1311,7 +1311,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	/* Register socket with portmapper */
 	if (*errp >= 0 && pmap_register)
 		*errp = svc_register(serv, inet->sk_family, inet->sk_protocol,
-				     ntohs(inet_sk(inet)->sport));
+				     ntohs(inet_sk(inet)->inet_sport));
 
 	if (*errp < 0) {
 		kfree(svsk);
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 3bb90b6f1dd3..e04566a2c4e5 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -273,11 +273,11 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 			case AF_INET: {
 				struct inet_sock *inet = inet_sk(sk);
 
-				print_ipv4_addr(ab, inet->rcv_saddr,
-						inet->sport,
+				print_ipv4_addr(ab, inet->inet_rcv_saddr,
+						inet->inet_sport,
 						"laddr", "lport");
-				print_ipv4_addr(ab, inet->daddr,
-						inet->dport,
+				print_ipv4_addr(ab, inet->inet_daddr,
+						inet->inet_dport,
 						"faddr", "fport");
 				break;
 			}
@@ -286,10 +286,10 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 				struct ipv6_pinfo *inet6 = inet6_sk(sk);
 
 				print_ipv6_addr(ab, &inet6->rcv_saddr,
-						inet->sport,
+						inet->inet_sport,
 						"laddr", "lport");
 				print_ipv6_addr(ab, &inet6->daddr,
-						inet->dport,
+						inet->inet_dport,
 						"faddr", "fport");
 				break;
 			}
-- 
cgit v1.2.3


From 4c2b1a11646bf74e2926ce8b13a21884adc1e05c Mon Sep 17 00:00:00 2001
From: Inaky Perez-Gonzalez <inaky@linux.intel.com>
Date: Wed, 2 Sep 2009 15:36:05 -0700
Subject: wimax: allow specifying debug levels as command line option

Add "debug" module options to all the wimax modules (including
drivers) so that the debug levels can be set upon kernel boot or
module load time.

This is needed as currently there was a limitation where the debug
levels could only be set when a device was succesfully
enumerated. This made it difficult to debug issues that made a device
not probe properly.

Signed-off-by: Inaky Perez-Gonzalez <inaky@linux.intel.com>
---
 drivers/net/wimax/i2400m/driver.c | 10 ++++++
 drivers/net/wimax/i2400m/sdio.c   | 10 ++++++
 drivers/net/wimax/i2400m/usb.c    |  9 +++++
 include/linux/wimax/debug.h       | 72 +++++++++++++++++++++++++++++++++++++++
 net/wimax/stack.c                 | 11 ++++++
 5 files changed, 112 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c
index 7ba00de5dd9b..e3b2c246cad7 100644
--- a/drivers/net/wimax/i2400m/driver.c
+++ b/drivers/net/wimax/i2400m/driver.c
@@ -90,6 +90,14 @@ MODULE_PARM_DESC(power_save_disabled,
 		 "False by default (so the device is told to do power "
 		 "saving).");
 
+static char i2400m_debug_params[128];
+module_param_string(debug, i2400m_debug_params, sizeof(i2400m_debug_params),
+		    0644);
+MODULE_PARM_DESC(debug,
+		 "String of space-separated NAME:VALUE pairs, where NAMEs "
+		 "are the different debug submodules and VALUE are the "
+		 "initial debug value to set.");
+
 /**
  * i2400m_queue_work - schedule work on a i2400m's queue
  *
@@ -794,6 +802,8 @@ size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
 static
 int __init i2400m_driver_init(void)
 {
+	d_parse_params(D_LEVEL, D_LEVEL_SIZE, i2400m_debug_params,
+		       "i2400m.debug");
 	return 0;
 }
 module_init(i2400m_driver_init);
diff --git a/drivers/net/wimax/i2400m/sdio.c b/drivers/net/wimax/i2400m/sdio.c
index 7c1b843b63e9..2d2cc5ac6d86 100644
--- a/drivers/net/wimax/i2400m/sdio.c
+++ b/drivers/net/wimax/i2400m/sdio.c
@@ -71,6 +71,14 @@
 static int ioe_timeout = 2;
 module_param(ioe_timeout, int, 0);
 
+static char i2400ms_debug_params[128];
+module_param_string(debug, i2400ms_debug_params, sizeof(i2400ms_debug_params),
+		    0644);
+MODULE_PARM_DESC(debug,
+		 "String of space-separated NAME:VALUE pairs, where NAMEs "
+		 "are the different debug submodules and VALUE are the "
+		 "initial debug value to set.");
+
 /* Our firmware file name list */
 static const char *i2400ms_bus_fw_names[] = {
 #define I2400MS_FW_FILE_NAME "i2400m-fw-sdio-1.3.sbcf"
@@ -559,6 +567,8 @@ struct sdio_driver i2400m_sdio_driver = {
 static
 int __init i2400ms_driver_init(void)
 {
+	d_parse_params(D_LEVEL, D_LEVEL_SIZE, i2400ms_debug_params,
+		       "i2400m_sdio.debug");
 	return sdio_register_driver(&i2400m_sdio_driver);
 }
 module_init(i2400ms_driver_init);
diff --git a/drivers/net/wimax/i2400m/usb.c b/drivers/net/wimax/i2400m/usb.c
index a5879e21bbf3..063422290a43 100644
--- a/drivers/net/wimax/i2400m/usb.c
+++ b/drivers/net/wimax/i2400m/usb.c
@@ -71,6 +71,13 @@
 #define D_SUBMODULE usb
 #include "usb-debug-levels.h"
 
+static char i2400mu_debug_params[128];
+module_param_string(debug, i2400mu_debug_params, sizeof(i2400mu_debug_params),
+		    0644);
+MODULE_PARM_DESC(debug,
+		 "String of space-separated NAME:VALUE pairs, where NAMEs "
+		 "are the different debug submodules and VALUE are the "
+		 "initial debug value to set.");
 
 /* Our firmware file name */
 static const char *i2400mu_bus_fw_names[] = {
@@ -633,6 +640,8 @@ struct usb_driver i2400mu_driver = {
 static
 int __init i2400mu_driver_init(void)
 {
+	d_parse_params(D_LEVEL, D_LEVEL_SIZE, i2400mu_debug_params,
+		       "i2400m_usb.debug");
 	return usb_register(&i2400mu_driver);
 }
 module_init(i2400mu_driver_init);
diff --git a/include/linux/wimax/debug.h b/include/linux/wimax/debug.h
index c703e0340423..db8096e88533 100644
--- a/include/linux/wimax/debug.h
+++ b/include/linux/wimax/debug.h
@@ -450,4 +450,76 @@ do {							\
 })
 
 
+static inline
+void d_submodule_set(struct d_level *d_level, size_t d_level_size,
+		     const char *submodule, u8 level, const char *tag)
+{
+	struct d_level *itr, *top;
+	int index = -1;
+
+	for (itr = d_level, top = itr + d_level_size; itr < top; itr++) {
+		index++;
+		if (itr->name == NULL) {
+			printk(KERN_ERR "%s: itr->name NULL?? (%p, #%d)\n",
+			       tag, itr, index);
+			continue;
+		}
+		if (!strcmp(itr->name, submodule)) {
+			itr->level = level;
+			return;
+		}
+	}
+	printk(KERN_ERR "%s: unknown submodule %s\n", tag, submodule);
+}
+
+
+/**
+ * d_parse_params - Parse a string with debug parameters from the
+ * command line
+ *
+ * @d_level: level structure (D_LEVEL)
+ * @d_level_size: number of items in the level structure
+ *     (D_LEVEL_SIZE).
+ * @_params: string with the parameters; this is a space (not tab!)
+ *     separated list of NAME:VALUE, where value is the debug level
+ *     and NAME is the name of the submodule.
+ * @tag: string for error messages (example: MODULE.ARGNAME).
+ */
+static inline
+void d_parse_params(struct d_level *d_level, size_t d_level_size,
+		    const char *_params, const char *tag)
+{
+	char submodule[130], *params, *params_orig, *token, *colon;
+	unsigned level, tokens;
+
+	if (_params == NULL)
+		return;
+	params_orig = kstrdup(_params, GFP_KERNEL);
+	params = params_orig;
+	while (1) {
+		token = strsep(&params, " ");
+		if (token == NULL)
+			break;
+		if (*token == '\0')	/* eat joint spaces */
+			continue;
+		/* kernel's sscanf %s eats until whitespace, so we
+		 * replace : by \n so it doesn't get eaten later by
+		 * strsep */
+		colon = strchr(token, ':');
+		if (colon != NULL)
+			*colon = '\n';
+		tokens = sscanf(token, "%s\n%u", submodule, &level);
+		if (colon != NULL)
+			*colon = ':';	/* set back, for error messages */
+		if (tokens == 2)
+			d_submodule_set(d_level, d_level_size,
+					submodule, level, tag);
+		else
+			printk(KERN_ERR "%s: can't parse '%s' as a "
+			       "SUBMODULE:LEVEL (%d tokens)\n",
+			       tag, token, tokens);
+	}
+	kfree(params_orig);
+}
+
 #endif /* #ifndef __debug__h__ */
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 79fb7d7c640f..c8866412f830 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -60,6 +60,14 @@
 #define D_SUBMODULE stack
 #include "debug-levels.h"
 
+static char wimax_debug_params[128];
+module_param_string(debug, wimax_debug_params, sizeof(wimax_debug_params),
+		    0644);
+MODULE_PARM_DESC(debug,
+		 "String of space-separated NAME:VALUE pairs, where NAMEs "
+		 "are the different debug submodules and VALUE are the "
+		 "initial debug value to set.");
+
 /*
  * Authoritative source for the RE_STATE_CHANGE attribute policy
  *
@@ -562,6 +570,9 @@ int __init wimax_subsys_init(void)
 	int result, cnt;
 
 	d_fnstart(4, NULL, "()\n");
+	d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params,
+		       "wimax.debug");
+
 	snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name),
 		 "WiMAX");
 	result = genl_register_family(&wimax_gnl_family);
-- 
cgit v1.2.3


From 32742e6158657f19ad31653705bef56d983508e7 Mon Sep 17 00:00:00 2001
From: Inaky Perez-Gonzalez <inaky@linux.intel.com>
Date: Thu, 3 Sep 2009 15:56:40 -0700
Subject: wimax/i2400m: decide properly if using signed vs non-signed firmware
 loading

The i2400m based devices can boot two main types of firmware images:
signed and non-signed. Signed images have signature data included that
must match that of a certificate stored in the device.

Currently the code is making the decission on what type of firmware
load (signed vs non-signed) is going to be loaded based on a hardcoded
decission in __i2400m_ack_verify(), based on the barker the device
sent upon boot.

This is not flexible enough as future hardware will emit more barkers;
thus the bit has to be set in a place where there is better knowledge
of what is going on. This will be done in follow-up commits -- however
this patch paves the way for it.

So the querying of the mode is packed into i2400m_boot_is_signed();
the main changes are just using i2400m_boot_is_signed() to determine
the method to follow and setting i2400m->sboot in
i2400m_is_boot_barker(). The modifications in i2400m_dnload_init() and
i2400m_dnload_finalize() are just reorganizing the order of the if
blocks and thus look larger than they really are.

Signed-off-by: Inaky Perez-Gonzalez <inaky@linux.intel.com>
---
 drivers/net/wimax/i2400m/fw.c | 35 +++++++++++++++++++++--------------
 include/linux/wimax/i2400m.h  | 10 ----------
 2 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wimax/i2400m/fw.c b/drivers/net/wimax/i2400m/fw.c
index 92d4d605dc2d..c962a8d8df7e 100644
--- a/drivers/net/wimax/i2400m/fw.c
+++ b/drivers/net/wimax/i2400m/fw.c
@@ -508,6 +508,17 @@ error_send:
 }
 
 
+/*
+ * Indicate if the device emitted a reboot barker that indicates
+ * "signed boot"
+ */
+static
+unsigned i2400m_boot_is_signed(struct i2400m *i2400m)
+{
+	return likely(i2400m->sboot);
+}
+
+
 /*
  * Do the final steps of uploading firmware
  *
@@ -529,7 +540,7 @@ int i2400m_dnload_finalize(struct i2400m *i2400m,
 
 	d_fnstart(3, dev, "offset %zu\n", offset);
 	cmd = (void *) bcf + offset;
-	if (i2400m->sboot == 0) {
+	if (i2400m_boot_is_signed(i2400m) == 0) {
 		struct i2400m_bootrom_header jump_ack;
 		d_printf(1, dev, "unsecure boot, jumping to 0x%08x\n",
 			le32_to_cpu(cmd->target_addr));
@@ -846,28 +857,24 @@ int i2400m_dnload_init(struct i2400m *i2400m, const struct i2400m_bcf_hdr *bcf)
 {
 	int result;
 	struct device *dev = i2400m_dev(i2400m);
-	u32 module_id = le32_to_cpu(bcf->module_id);
 
-	if (i2400m->sboot == 0
-	    && (module_id & I2400M_BCF_MOD_ID_POKES) == 0) {
-		/* non-signed boot process without pokes */
-		result = i2400m_dnload_init_nonsigned(i2400m);
+	if (i2400m_boot_is_signed(i2400m)) {
+		d_printf(1, dev, "signed boot\n");
+		result = i2400m_dnload_init_signed(i2400m, bcf);
 		if (result == -ERESTARTSYS)
 			return result;
 		if (result < 0)
-			dev_err(dev, "fw %s: non-signed download "
+			dev_err(dev, "firmware %s: signed boot download "
 				"initialization failed: %d\n",
 				i2400m->fw_name, result);
-	} else if (i2400m->sboot == 0
-		 && (module_id & I2400M_BCF_MOD_ID_POKES)) {
-		/* non-signed boot process with pokes, nothing to do */
-		result = 0;
-	} else {		 /* signed boot process */
-		result = i2400m_dnload_init_signed(i2400m, bcf);
+	} else {
+		/* non-signed boot process without pokes */
+		d_printf(1, dev, "non-signed boot\n");
+		result = i2400m_dnload_init_nonsigned(i2400m);
 		if (result == -ERESTARTSYS)
 			return result;
 		if (result < 0)
-			dev_err(dev, "fw %s: signed boot download "
+			dev_err(dev, "firmware %s: non-signed download "
 				"initialization failed: %d\n",
 				i2400m->fw_name, result);
 	}
diff --git a/include/linux/wimax/i2400m.h b/include/linux/wimax/i2400m.h
index 433693ef2bb0..d6e2a3595682 100644
--- a/include/linux/wimax/i2400m.h
+++ b/include/linux/wimax/i2400m.h
@@ -168,16 +168,6 @@ enum i2400m_brh {
 };
 
 
-/* Constants for bcf->module_id */
-enum i2400m_bcf_mod_id {
-	/* Firmware file carries its own pokes -- pokes are a set of
-	 * magical values that have to be written in certain memory
-	 * addresses to get the device up and ready for firmware
-	 * download when it is in non-signed boot mode. */
-	I2400M_BCF_MOD_ID_POKES = 0x000000001,
-};
-
-
 /**
  * i2400m_bootrom_header - Header for a boot-mode command
  *
-- 
cgit v1.2.3


From 923d708fed9d47c7b4d67694500d766337663e29 Mon Sep 17 00:00:00 2001
From: Inaky Perez-Gonzalez <inaky@linux.intel.com>
Date: Fri, 4 Sep 2009 14:50:59 -0700
Subject: wimax/i2400m: fix reboot echo/ack barker deadlock

The i2400m based devices can get in a sort of a deadlock some times;
when they boot, they send a reboot "barker" (a magic number) and then
the driver has to echo that same barker to ack reception
(echo/ack). Then the device does a final ack by sending an ACK barker.

The first time this happens, we don't know ahead of time with barker
the device is going to send, as different device models and SKUs will
send different barker depending on the EEPROM programming.

If the device has sent the barker before the driver has been able to
read it, the driver looses, as it doesn't know which barker it has to
echo/ack back. With older devices, we tried a couple of combinations
and that always worked; but now, with adding support for more, in
which we have an unlimited number of new barkers, that is not an
option.

So we rework said case so that when the device gets stuck, we just
cycle through all the known types until one forces the device to send
an ack. Otherwise, the driver gives up and aborts.

Signed-off-by: Inaky Perez-Gonzalez <inaky@linux.intel.com>
---
 drivers/net/wimax/i2400m/fw.c | 63 ++++++++++++++++++++++++++++++++++---------
 include/linux/wimax/i2400m.h  |  2 +-
 2 files changed, 52 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wimax/i2400m/fw.c b/drivers/net/wimax/i2400m/fw.c
index 55fe011a9633..eef236d85af3 100644
--- a/drivers/net/wimax/i2400m/fw.c
+++ b/drivers/net/wimax/i2400m/fw.c
@@ -812,7 +812,7 @@ int i2400m_dnload_finalize(struct i2400m *i2400m,
  *
  * @i2400m: device descriptor
  * @flags:
- *      I2400M_BRI_SOFT: a reboot notification has been seen
+ *      I2400M_BRI_SOFT: a reboot barker has been seen
  *          already, so don't wait for it.
  *
  *      I2400M_BRI_NO_REBOOT: Don't send a reboot command, but wait
@@ -829,8 +829,9 @@ int i2400m_dnload_finalize(struct i2400m *i2400m,
  * main phases to this:
  *
  * a. (1) send a reboot command and (2) get a reboot barker
- * b. (1) ack the reboot sending a reboot barker and (2) getting an
- *        ack barker in return
+ *
+ * b. (1) echo/ack the reboot sending the reboot barker back and (2)
+ *        getting an ack barker in return
  *
  * We want to skip (a) in some cases [soft]. The state machine is
  * horrible, but it is basically: on each phase, send what has to be
@@ -838,6 +839,16 @@ int i2400m_dnload_finalize(struct i2400m *i2400m,
  * have to backtrack and retry, so we keep a max tries counter for
  * that.
  *
+ * It sucks because we don't know ahead of time which is going to be
+ * the reboot barker (the device might send different ones depending
+ * on its EEPROM config) and once the device reboots and waits for the
+ * echo/ack reboot barker being sent back, it doesn't understand
+ * anything else. So we can be left at the point where we don't know
+ * what to send to it -- cold reset and bus reset seem to have little
+ * effect. So the function iterates (in this case) through all the
+ * known barkers and tries them all until an ACK is
+ * received. Otherwise, it gives up.
+ *
  * If we get a timeout after sending a warm reset, we do it again.
  */
 int i2400m_bootrom_init(struct i2400m *i2400m, enum i2400m_bri flags)
@@ -848,6 +859,7 @@ int i2400m_bootrom_init(struct i2400m *i2400m, enum i2400m_bri flags)
 	struct i2400m_bootrom_header ack;
 	int count = i2400m->bus_bm_retries;
 	int ack_timeout_cnt = 1;
+	unsigned i;
 
 	BUILD_BUG_ON(sizeof(*cmd) != sizeof(i2400m_barker_db[0].data));
 	BUILD_BUG_ON(sizeof(ack) != sizeof(i2400m_ACK_BARKER));
@@ -858,6 +870,7 @@ int i2400m_bootrom_init(struct i2400m *i2400m, enum i2400m_bri flags)
 	if (flags & I2400M_BRI_SOFT)
 		goto do_reboot_ack;
 do_reboot:
+	ack_timeout_cnt = 1;
 	if (--count < 0)
 		goto error_timeout;
 	d_printf(4, dev, "device reboot: reboot command [%d # left]\n",
@@ -869,22 +882,47 @@ do_reboot:
 	flags &= ~I2400M_BRI_NO_REBOOT;
 	switch (result) {
 	case -ERESTARTSYS:
+		/*
+		 * at this point, i2400m_bm_cmd(), through
+		 * __i2400m_bm_ack_process(), has updated
+		 * i2400m->barker and we are good to go.
+		 */
 		d_printf(4, dev, "device reboot: got reboot barker\n");
 		break;
 	case -EISCONN:	/* we don't know how it got here...but we follow it */
 		d_printf(4, dev, "device reboot: got ack barker - whatever\n");
 		goto do_reboot;
-	case -ETIMEDOUT:	/* device has timed out, we might be in boot
-				 * mode already and expecting an ack, let's try
-				 * that */
-		if (i2400m->barker == NULL) {
-			dev_info(dev, "warm reset timed out, unknown barker "
-				 "type, rebooting\n");
-			goto do_reboot;
-		} else {
-			dev_info(dev, "warm reset timed out, trying an ack\n");
+	case -ETIMEDOUT:
+		/*
+		 * Device has timed out, we might be in boot mode
+		 * already and expecting an ack; if we don't know what
+		 * the barker is, we just send them all. Cold reset
+		 * and bus reset don't work. Beats me.
+		 */
+		if (i2400m->barker != NULL) {
+			dev_err(dev, "device boot: reboot barker timed out, "
+				"trying (set) %08x echo/ack\n",
+				le32_to_cpu(i2400m->barker->data[0]));
 			goto do_reboot_ack;
 		}
+		for (i = 0; i < i2400m_barker_db_used; i++) {
+			struct i2400m_barker_db *barker = &i2400m_barker_db[i];
+			memcpy(cmd, barker->data, sizeof(barker->data));
+			result = i2400m_bm_cmd(i2400m, cmd, sizeof(*cmd),
+					       &ack, sizeof(ack),
+					       I2400M_BM_CMD_RAW);
+			if (result == -EISCONN) {
+				dev_warn(dev, "device boot: got ack barker "
+					 "after sending echo/ack barker "
+					 "#%d/%08x; rebooting j.i.c.\n",
+					 i, le32_to_cpu(barker->data[0]));
+				flags &= ~I2400M_BRI_NO_REBOOT;
+				goto do_reboot;
+			}
+		}
+		dev_err(dev, "device boot: tried all the echo/acks, could "
+			"not get device to respond; giving up");
+		result = -ESHUTDOWN;
 	case -EPROTO:
 	case -ESHUTDOWN:	/* dev is gone */
 	case -EINTR:		/* user cancelled */
@@ -892,6 +930,7 @@ do_reboot:
 	default:
 		dev_err(dev, "device reboot: error %d while waiting "
 			"for reboot barker - rebooting\n", result);
+		d_dump(1, dev, &ack, result);
 		goto do_reboot;
 	}
 	/* At this point we ack back with 4 REBOOT barkers and expect
diff --git a/include/linux/wimax/i2400m.h b/include/linux/wimax/i2400m.h
index d6e2a3595682..fd5af05083cb 100644
--- a/include/linux/wimax/i2400m.h
+++ b/include/linux/wimax/i2400m.h
@@ -138,7 +138,7 @@ struct i2400m_bcf_hdr {
 	__le32 module_id;
 	__le32 module_vendor;
 	__le32 date;		/* BCD YYYMMDD */
-	__le32 size;
+	__le32 size;            /* in dwords */
 	__le32 key_size;	/* in dwords */
 	__le32 modulus_size;	/* in dwords */
 	__le32 exponent_size;	/* in dwords */
-- 
cgit v1.2.3


From f8fc3295570115267ce1ce901f362d13d194aefc Mon Sep 17 00:00:00 2001
From: Cindy H Kao <cindy.h.kao@intel.com>
Date: Fri, 4 Sep 2009 17:38:46 -0700
Subject: wimax/iwmc3200: add new sdio device ID to support iwmc3200 2.5GHz sku

Different sdio device IDs are designated to support different intel
wimax silicon sku. The new macro SDIO_DEVICE_ID_IWMC3200_WIMAX_2G5(0x1407)
is added to support iwmc3200 2.5GHz sku.  The existing
SDIO_DEVICE_ID_IWMC3200_WIMAX(0x1402) is for iwmc3200 general sku.

Signed-off-by: Cindy H Kao <cindy.h.kao@intel.com>
Signed-off-by: Inaky Perez-Gonzalez <inaky@linux.intel.com>
---
 drivers/net/wimax/i2400m/sdio.c | 2 ++
 include/linux/mmc/sdio_ids.h    | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/wimax/i2400m/sdio.c b/drivers/net/wimax/i2400m/sdio.c
index 2d2cc5ac6d86..de158ee0c32c 100644
--- a/drivers/net/wimax/i2400m/sdio.c
+++ b/drivers/net/wimax/i2400m/sdio.c
@@ -550,6 +550,8 @@ const struct sdio_device_id i2400ms_sdio_ids[] = {
 	/* Intel: i2400m WiMAX (iwmc3200) over SDIO */
 	{ SDIO_DEVICE(SDIO_VENDOR_ID_INTEL,
 		      SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX) },
+	{ SDIO_DEVICE(SDIO_VENDOR_ID_INTEL,
+		      SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX_2G5) },
 	{ /* end: all zeroes */ },
 };
 MODULE_DEVICE_TABLE(sdio, i2400ms_sdio_ids);
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 2dbfb5a05994..33b2ea09a4ad 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -28,6 +28,7 @@
 #define SDIO_DEVICE_ID_INTEL_IWMC3200TOP	0x1404
 #define SDIO_DEVICE_ID_INTEL_IWMC3200GPS	0x1405
 #define SDIO_DEVICE_ID_INTEL_IWMC3200BT		0x1406
+#define SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX_2G5	0x1407
 
 #define SDIO_VENDOR_ID_MARVELL			0x02df
 #define SDIO_DEVICE_ID_MARVELL_LIBERTAS		0x9103
-- 
cgit v1.2.3


From 7329012e673231dee9a21567cfb9881f5ea462ba Mon Sep 17 00:00:00 2001
From: Dirk Brandewie <dirk.j.brandewie@intel.com>
Date: Wed, 12 Aug 2009 11:29:46 -0700
Subject: wimax/i6x50: add Intel WiFi/WiMAX Link 6050 Series support

Add support for the WiMAX device in the Intel WiFi/WiMAX Link 6050
Series; this involves:

 - adding the device ID to bind to and an endpoint mapping for the
   driver to use.

 - at probe() time, some things are set depending on the device id:

   + the list of firmware names to try

   + mapping of endpoints

Signed-off-by: Dirk Brandewie <dirk.j.brandewie@intel.com>
Signed-off-by: Inaky Perez-Gonzalez <inaky@linux.intel.com>
---
 drivers/net/wimax/i2400m/fw.c         |  3 +++
 drivers/net/wimax/i2400m/i2400m-usb.h |  3 +++
 drivers/net/wimax/i2400m/usb.c        | 26 +++++++++++++++++++-------
 include/linux/wimax/i2400m.h          |  1 +
 4 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wimax/i2400m/fw.c b/drivers/net/wimax/i2400m/fw.c
index 84a39c30c3d3..5719f4a4080f 100644
--- a/drivers/net/wimax/i2400m/fw.c
+++ b/drivers/net/wimax/i2400m/fw.c
@@ -277,6 +277,9 @@ int i2400m_barker_db_known_barkers(void)
 	result = i2400m_barker_db_add(I2400M_SBOOT_BARKER);
 	if (result < 0)
 		goto error_add;
+	result = i2400m_barker_db_add(I2400M_SBOOT_BARKER_6050);
+	if (result < 0)
+		goto error_add;
 error_add:
        return result;
 }
diff --git a/drivers/net/wimax/i2400m/i2400m-usb.h b/drivers/net/wimax/i2400m/i2400m-usb.h
index f73a067e0668..5cc0f279417e 100644
--- a/drivers/net/wimax/i2400m/i2400m-usb.h
+++ b/drivers/net/wimax/i2400m/i2400m-usb.h
@@ -148,6 +148,9 @@ enum {
 	I2400MU_MAX_NOTIFICATION_LEN = 256,
 	I2400MU_BLK_SIZE = 16,
 	I2400MU_PL_SIZE_MAX = 0x3EFF,
+
+	/* Device IDs */
+	USB_DEVICE_ID_I6050 = 0x0186,
 };
 
 
diff --git a/drivers/net/wimax/i2400m/usb.c b/drivers/net/wimax/i2400m/usb.c
index 063422290a43..77d08d928274 100644
--- a/drivers/net/wimax/i2400m/usb.c
+++ b/drivers/net/wimax/i2400m/usb.c
@@ -80,11 +80,16 @@ MODULE_PARM_DESC(debug,
 		 "initial debug value to set.");
 
 /* Our firmware file name */
-static const char *i2400mu_bus_fw_names[] = {
+static const char *i2400mu_bus_fw_names_5x50[] = {
 #define I2400MU_FW_FILE_NAME_v1_4 "i2400m-fw-usb-1.4.sbcf"
 	I2400MU_FW_FILE_NAME_v1_4,
-#define I2400MU_FW_FILE_NAME_v1_3 "i2400m-fw-usb-1.3.sbcf"
-	I2400MU_FW_FILE_NAME_v1_3,
+	NULL,
+};
+
+
+static const char *i2400mu_bus_fw_names_6050[] = {
+#define I6050U_FW_FILE_NAME_v1_5 "i6050-fw-usb-1.5.sbcf"
+	I6050U_FW_FILE_NAME_v1_5,
 	NULL,
 };
 
@@ -418,10 +423,16 @@ int i2400mu_probe(struct usb_interface *iface,
 	i2400m->bus_bm_retries = I2400M_USB_BOOT_RETRIES;
 	i2400m->bus_bm_cmd_send = i2400mu_bus_bm_cmd_send;
 	i2400m->bus_bm_wait_for_ack = i2400mu_bus_bm_wait_for_ack;
-	i2400m->bus_fw_names = i2400mu_bus_fw_names;
 	i2400m->bus_bm_mac_addr_impaired = 0;
 
-	{
+	if (id->idProduct == USB_DEVICE_ID_I6050) {
+		i2400m->bus_fw_names = i2400mu_bus_fw_names_6050;
+		i2400mu->endpoint_cfg.bulk_out = 0;
+		i2400mu->endpoint_cfg.notification = 3;
+		i2400mu->endpoint_cfg.reset_cold = 2;
+		i2400mu->endpoint_cfg.bulk_in = 1;
+	} else {
+		i2400m->bus_fw_names = i2400mu_bus_fw_names_5x50;
 		i2400mu->endpoint_cfg.bulk_out = 0;
 		i2400mu->endpoint_cfg.notification = 1;
 		i2400mu->endpoint_cfg.reset_cold = 2;
@@ -614,6 +625,7 @@ out:
 
 static
 struct usb_device_id i2400mu_id_table[] = {
+	{ USB_DEVICE(0x8086, USB_DEVICE_ID_I6050) },
 	{ USB_DEVICE(0x8086, 0x0181) },
 	{ USB_DEVICE(0x8086, 0x1403) },
 	{ USB_DEVICE(0x8086, 0x1405) },
@@ -656,7 +668,7 @@ void __exit i2400mu_driver_exit(void)
 module_exit(i2400mu_driver_exit);
 
 MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>");
-MODULE_DESCRIPTION("Intel 2400M WiMAX networking for USB");
+MODULE_DESCRIPTION("Driver for USB based Intel Wireless WiMAX Connection 2400M "
+		   "(5x50 & 6050)");
 MODULE_LICENSE("GPL");
 MODULE_FIRMWARE(I2400MU_FW_FILE_NAME_v1_4);
-MODULE_FIRMWARE(I2400MU_FW_FILE_NAME_v1_3);
diff --git a/include/linux/wimax/i2400m.h b/include/linux/wimax/i2400m.h
index fd5af05083cb..62d356153565 100644
--- a/include/linux/wimax/i2400m.h
+++ b/include/linux/wimax/i2400m.h
@@ -266,6 +266,7 @@ enum {
 	I2400M_WARM_RESET_BARKER = 0x50f750f7,
 	I2400M_NBOOT_BARKER = 0xdeadbeef,
 	I2400M_SBOOT_BARKER = 0x0ff1c1a1,
+	I2400M_SBOOT_BARKER_6050 = 0x80000001,
 	I2400M_ACK_BARKER = 0xfeedbabe,
 	I2400M_D2H_MSG_BARKER = 0xbeefbabe,
 };
-- 
cgit v1.2.3


From 7e75f93eda027d9f9e0203ee6ffd210ea92e98f3 Mon Sep 17 00:00:00 2001
From: jamal <hadi@cyberus.ca>
Date: Mon, 19 Oct 2009 02:17:56 +0000
Subject: pkt_sched: ingress socket filter by mark

Allow bpf to set a filter to drop packets that dont
match a specific mark

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 3 ++-
 net/core/filter.c      | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1354aaf6abbe..909193e25395 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -123,7 +123,8 @@ struct sock_fprog	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_IFINDEX 	8
 #define SKF_AD_NLATTR	12
 #define SKF_AD_NLATTR_NEST	16
-#define SKF_AD_MAX	20
+#define SKF_AD_MARK 	20
+#define SKF_AD_MAX	24
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index d1d779ca096d..e3987e1d4839 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -303,6 +303,9 @@ load_b:
 		case SKF_AD_IFINDEX:
 			A = skb->dev->ifindex;
 			continue;
+		case SKF_AD_MARK:
+			A = skb->mark;
+			continue;
 		case SKF_AD_NLATTR: {
 			struct nlattr *nla;
 
-- 
cgit v1.2.3


From 7b6856a0296a8f187bb88ba31fa83a08abba7966 Mon Sep 17 00:00:00 2001
From: Wolfgang Grandegger <wg@grandegger.com>
Date: Tue, 20 Oct 2009 00:08:01 -0700
Subject: can: provide library functions for skb allocation

This patch makes the private functions alloc_can_skb() and
alloc_can_err_skb() of the at91_can driver public and adapts all
drivers to use these. While making the patch I realized, that
the skb's are *not* setup consistently. It's now done as shown
below:

  skb->protocol = htons(ETH_P_CAN);
  skb->pkt_type = PACKET_BROADCAST;
  skb->ip_summed = CHECKSUM_UNNECESSARY;
  *cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
  memset(*cf, 0, sizeof(struct can_frame));

The frame is zeroed out to avoid uninitialized data to be passed to
user space. Some drivers or library code did not set "pkt_type" or
"ip_summed". Also,  "__constant_htons()" should not be used for
runtime invocations, as pointed out by David Miller.

Signed-off-by: Wolfgang Grandegger <wg@grandegger.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/can/at91_can.c        | 32 -----------------------------
 drivers/net/can/dev.c             | 42 ++++++++++++++++++++++++++++++++-------
 drivers/net/can/sja1000/sja1000.c | 12 ++---------
 drivers/net/can/ti_hecc.c         | 17 ++++------------
 drivers/net/can/usb/ems_usb.c     | 16 ++-------------
 include/linux/can/dev.h           |  4 ++++
 6 files changed, 47 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/at91_can.c b/drivers/net/can/at91_can.c
index b13fd9114130..cbe3fce53e3b 100644
--- a/drivers/net/can/at91_can.c
+++ b/drivers/net/can/at91_can.c
@@ -221,38 +221,6 @@ static inline void set_mb_mode(const struct at91_priv *priv, unsigned int mb,
 	set_mb_mode_prio(priv, mb, mode, 0);
 }
 
-static struct sk_buff *alloc_can_skb(struct net_device *dev,
-		struct can_frame **cf)
-{
-	struct sk_buff *skb;
-
-	skb = netdev_alloc_skb(dev, sizeof(struct can_frame));
-	if (unlikely(!skb))
-		return NULL;
-
-	skb->protocol = htons(ETH_P_CAN);
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
-	*cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
-
-	return skb;
-}
-
-static struct sk_buff *alloc_can_err_skb(struct net_device *dev,
-		struct can_frame **cf)
-{
-	struct sk_buff *skb;
-
-	skb = alloc_can_skb(dev, cf);
-	if (unlikely(!skb))
-		return NULL;
-
-	memset(*cf, 0, sizeof(struct can_frame));
-	(*cf)->can_id = CAN_ERR_FLAG;
-	(*cf)->can_dlc = CAN_ERR_DLC;
-
-	return skb;
-}
-
 /*
  * Swtich transceiver on or off
  */
diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 39b99f57c265..c3db111d2ff5 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -366,17 +366,12 @@ void can_restart(unsigned long data)
 	can_flush_echo_skb(dev);
 
 	/* send restart message upstream */
-	skb = dev_alloc_skb(sizeof(struct can_frame));
+	skb = alloc_can_err_skb(dev, &cf);
 	if (skb == NULL) {
 		err = -ENOMEM;
 		goto restart;
 	}
-	skb->dev = dev;
-	skb->protocol = htons(ETH_P_CAN);
-	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
-	memset(cf, 0, sizeof(struct can_frame));
-	cf->can_id = CAN_ERR_FLAG | CAN_ERR_RESTARTED;
-	cf->can_dlc = CAN_ERR_DLC;
+	cf->can_id |= CAN_ERR_RESTARTED;
 
 	netif_rx(skb);
 
@@ -449,6 +444,39 @@ static void can_setup(struct net_device *dev)
 	dev->features = NETIF_F_NO_CSUM;
 }
 
+struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf)
+{
+	struct sk_buff *skb;
+
+	skb = netdev_alloc_skb(dev, sizeof(struct can_frame));
+	if (unlikely(!skb))
+		return NULL;
+
+	skb->protocol = htons(ETH_P_CAN);
+	skb->pkt_type = PACKET_BROADCAST;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	*cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
+	memset(*cf, 0, sizeof(struct can_frame));
+
+	return skb;
+}
+EXPORT_SYMBOL_GPL(alloc_can_skb);
+
+struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_can_skb(dev, cf);
+	if (unlikely(!skb))
+		return NULL;
+
+	(*cf)->can_id = CAN_ERR_FLAG;
+	(*cf)->can_dlc = CAN_ERR_DLC;
+
+	return skb;
+}
+EXPORT_SYMBOL_GPL(alloc_can_err_skb);
+
 /*
  * Allocate and setup space for the CAN network device
  */
diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index 96d8be4253f8..1a9c5958bbd7 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -296,11 +296,9 @@ static void sja1000_rx(struct net_device *dev)
 	uint8_t dlc;
 	int i;
 
-	skb = dev_alloc_skb(sizeof(struct can_frame));
+	skb = alloc_can_skb(dev, &cf);
 	if (skb == NULL)
 		return;
-	skb->dev = dev;
-	skb->protocol = htons(ETH_P_CAN);
 
 	fi = priv->read_reg(priv, REG_FI);
 	dlc = fi & 0x0F;
@@ -351,15 +349,9 @@ static int sja1000_err(struct net_device *dev, uint8_t isrc, uint8_t status)
 	enum can_state state = priv->can.state;
 	uint8_t ecc, alc;
 
-	skb = dev_alloc_skb(sizeof(struct can_frame));
+	skb = alloc_can_err_skb(dev, &cf);
 	if (skb == NULL)
 		return -ENOMEM;
-	skb->dev = dev;
-	skb->protocol = htons(ETH_P_CAN);
-	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
-	memset(cf, 0, sizeof(struct can_frame));
-	cf->can_id = CAN_ERR_FLAG;
-	cf->can_dlc = CAN_ERR_DLC;
 
 	if (isrc & IRQ_DOI) {
 		/* data overrun interrupt */
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index 23a7128e4eb7..07e8016b17ec 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -535,18 +535,15 @@ static int ti_hecc_rx_pkt(struct ti_hecc_priv *priv, int mbxno)
 	u32 data, mbx_mask;
 	unsigned long flags;
 
-	skb = netdev_alloc_skb(priv->ndev, sizeof(struct can_frame));
+	skb = alloc_can_skb(priv->ndev, &cf);
 	if (!skb) {
 		if (printk_ratelimit())
 			dev_err(priv->ndev->dev.parent,
-				"ti_hecc_rx_pkt: netdev_alloc_skb() failed\n");
+				"ti_hecc_rx_pkt: alloc_can_skb() failed\n");
 		return -ENOMEM;
 	}
-	skb->protocol = __constant_htons(ETH_P_CAN);
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 	mbx_mask = BIT(mbxno);
-	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
 	data = hecc_read_mbx(priv, mbxno, HECC_CANMID);
 	if (data & HECC_CANMID_IDE)
 		cf->can_id = (data & CAN_EFF_MASK) | CAN_EFF_FLAG;
@@ -656,19 +653,13 @@ static int ti_hecc_error(struct net_device *ndev, int int_status,
 	struct sk_buff *skb;
 
 	/* propogate the error condition to the can stack */
-	skb = netdev_alloc_skb(ndev, sizeof(struct can_frame));
+	skb = alloc_can_err_skb(ndev, &cf);
 	if (!skb) {
 		if (printk_ratelimit())
 			dev_err(priv->ndev->dev.parent,
-				"ti_hecc_error: netdev_alloc_skb() failed\n");
+				"ti_hecc_error: alloc_can_err_skb() failed\n");
 		return -ENOMEM;
 	}
-	skb->protocol = __constant_htons(ETH_P_CAN);
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
-	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
-	memset(cf, 0, sizeof(struct can_frame));
-	cf->can_id = CAN_ERR_FLAG;
-	cf->can_dlc = CAN_ERR_DLC;
 
 	if (int_status & HECC_CANGIF_WLIF) { /* warning level int */
 		if ((int_status & HECC_CANGIF_BOIF) == 0) {
diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c
index a65f56a9cd3d..3685f3e42d12 100644
--- a/drivers/net/can/usb/ems_usb.c
+++ b/drivers/net/can/usb/ems_usb.c
@@ -311,14 +311,10 @@ static void ems_usb_rx_can_msg(struct ems_usb *dev, struct ems_cpc_msg *msg)
 	int i;
 	struct net_device_stats *stats = &dev->netdev->stats;
 
-	skb = netdev_alloc_skb(dev->netdev, sizeof(struct can_frame));
+	skb = alloc_can_skb(dev->netdev, &cf);
 	if (skb == NULL)
 		return;
 
-	skb->protocol = htons(ETH_P_CAN);
-
-	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
-
 	cf->can_id = msg->msg.can_msg.id;
 	cf->can_dlc = min_t(u8, msg->msg.can_msg.length, 8);
 
@@ -346,18 +342,10 @@ static void ems_usb_rx_err(struct ems_usb *dev, struct ems_cpc_msg *msg)
 	struct sk_buff *skb;
 	struct net_device_stats *stats = &dev->netdev->stats;
 
-	skb = netdev_alloc_skb(dev->netdev, sizeof(struct can_frame));
+	skb = alloc_can_err_skb(dev->netdev, &cf);
 	if (skb == NULL)
 		return;
 
-	skb->protocol = htons(ETH_P_CAN);
-
-	cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
-	memset(cf, 0, sizeof(struct can_frame));
-
-	cf->can_id = CAN_ERR_FLAG;
-	cf->can_dlc = CAN_ERR_DLC;
-
 	if (msg->type == CPC_MSG_TYPE_CAN_STATE) {
 		u8 state = msg->msg.can_state;
 
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 1d3f7f00e3af..1ed2a5cc03f5 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -68,4 +68,8 @@ void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
 void can_get_echo_skb(struct net_device *dev, unsigned int idx);
 void can_free_echo_skb(struct net_device *dev, unsigned int idx);
 
+struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf);
+struct sk_buff *alloc_can_err_skb(struct net_device *dev,
+				  struct can_frame **cf);
+
 #endif /* CAN_DEV_H */
-- 
cgit v1.2.3


From d19742fb1c68e6db83b76e06dea5a374c99e104f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 20 Oct 2009 01:06:22 -0700
Subject: filter: Add SKF_AD_QUEUE instruction

It can help being able to filter packets on their queue_mapping.

If filter performance is not good, we could add a "numqueue" field
in struct packet_type, so that netif_nit_deliver() and other functions
can directly ignore packets with not expected queue number.

Lets experiment this simple filter extension first.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 3 ++-
 net/core/filter.c      | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 909193e25395..bb3b4352978a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -124,7 +124,8 @@ struct sock_fprog	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_NLATTR	12
 #define SKF_AD_NLATTR_NEST	16
 #define SKF_AD_MARK 	20
-#define SKF_AD_MAX	24
+#define SKF_AD_QUEUE	24
+#define SKF_AD_MAX	28
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index e3987e1d4839..08db7b9143a3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -306,6 +306,9 @@ load_b:
 		case SKF_AD_MARK:
 			A = skb->mark;
 			continue;
+		case SKF_AD_QUEUE:
+			A = skb->queue_mapping;
+			continue;
 		case SKF_AD_NLATTR: {
 			struct nlattr *nla;
 
-- 
cgit v1.2.3


From 1c55d62e77fa16cdace417834fc7b8a421a1877f Mon Sep 17 00:00:00 2001
From: jamal <hadi@cyberus.ca>
Date: Thu, 15 Oct 2009 03:09:18 +0000
Subject: pkt_sched: skbedit add support for setting mark

This adds support for setting the skb mark.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tc_act/tc_skbedit.h |  2 ++
 include/net/tc_act/tc_skbedit.h   |  2 ++
 net/sched/act_skbedit.c           | 17 ++++++++++++++++-
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tc_act/tc_skbedit.h b/include/linux/tc_act/tc_skbedit.h
index a14e461a7af7..7a2e910a5f08 100644
--- a/include/linux/tc_act/tc_skbedit.h
+++ b/include/linux/tc_act/tc_skbedit.h
@@ -26,6 +26,7 @@
 
 #define SKBEDIT_F_PRIORITY		0x1
 #define SKBEDIT_F_QUEUE_MAPPING		0x2
+#define SKBEDIT_F_MARK			0x4
 
 struct tc_skbedit {
 	tc_gen;
@@ -37,6 +38,7 @@ enum {
 	TCA_SKBEDIT_PARMS,
 	TCA_SKBEDIT_PRIORITY,
 	TCA_SKBEDIT_QUEUE_MAPPING,
+	TCA_SKBEDIT_MARK,
 	__TCA_SKBEDIT_MAX
 };
 #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 6abb3ed3ebf7..e103fe02f375 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -26,7 +26,9 @@ struct tcf_skbedit {
 	struct tcf_common	common;
 	u32			flags;
 	u32     		priority;
+	u32     		mark;
 	u16			queue_mapping;
+	/* XXX: 16-bit pad here? */
 };
 #define to_skbedit(pc) \
 	container_of(pc, struct tcf_skbedit, common)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 4ab916b8074b..e9607fe55b58 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -54,6 +54,8 @@ static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a,
 	if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
 	    skb->dev->real_num_tx_queues > d->queue_mapping)
 		skb_set_queue_mapping(skb, d->queue_mapping);
+	if (d->flags & SKBEDIT_F_MARK)
+		skb->mark = d->mark;
 
 	spin_unlock(&d->tcf_lock);
 	return d->tcf_action;
@@ -63,6 +65,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 	[TCA_SKBEDIT_PARMS]		= { .len = sizeof(struct tc_skbedit) },
 	[TCA_SKBEDIT_PRIORITY]		= { .len = sizeof(u32) },
 	[TCA_SKBEDIT_QUEUE_MAPPING]	= { .len = sizeof(u16) },
+	[TCA_SKBEDIT_MARK]		= { .len = sizeof(u32) },
 };
 
 static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
@@ -72,7 +75,7 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
 	struct tc_skbedit *parm;
 	struct tcf_skbedit *d;
 	struct tcf_common *pc;
-	u32 flags = 0, *priority = NULL;
+	u32 flags = 0, *priority = NULL, *mark = NULL;
 	u16 *queue_mapping = NULL;
 	int ret = 0, err;
 
@@ -95,6 +98,12 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
 		flags |= SKBEDIT_F_QUEUE_MAPPING;
 		queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
 	}
+
+	if (tb[TCA_SKBEDIT_MARK] != NULL) {
+		flags |= SKBEDIT_F_MARK;
+		mark = nla_data(tb[TCA_SKBEDIT_MARK]);
+	}
+
 	if (!flags)
 		return -EINVAL;
 
@@ -124,6 +133,9 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
 		d->priority = *priority;
 	if (flags & SKBEDIT_F_QUEUE_MAPPING)
 		d->queue_mapping = *queue_mapping;
+	if (flags & SKBEDIT_F_MARK)
+		d->mark = *mark;
+
 	d->tcf_action = parm->action;
 
 	spin_unlock_bh(&d->tcf_lock);
@@ -161,6 +173,9 @@ static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	if (d->flags & SKBEDIT_F_QUEUE_MAPPING)
 		NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING,
 			sizeof(d->queue_mapping), &d->queue_mapping);
+	if (d->flags & SKBEDIT_F_MARK)
+		NLA_PUT(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
+			&d->mark);
 	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
-- 
cgit v1.2.3


From 40b1f4e5113eafc5e84f2ba86822df66087fcb25 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Thu, 22 Oct 2009 14:39:28 +1100
Subject: irq: trivial: Fix typo in comment for #endif

The comment suggests this #endif is CONFIG_X86 but it's really
CONFIG_TRACE_IRQFLAGS_SUPPORT

Signed-off-by: Michael Neuling <mikey@neuling.org>
Cc: michael@ellerman.id.au
LKML-Reference: <18191.1256182768@neuling.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irqflags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index b02a3f1d46a0..006bf45eae30 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -124,6 +124,6 @@
 	typecheck(unsigned long, flags);	\
 	raw_irqs_disabled_flags(flags);		\
 })
-#endif		/* CONFIG_X86 */
+#endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */
 
 #endif
-- 
cgit v1.2.3


From 5c828713358cb9df8aa174371edcbbb62203a490 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 23 Oct 2009 14:58:11 +0200
Subject: ratelimit: Make suppressed output messages more useful

Today I got:

  [39648.224782] Registered led device: iwl-phy0::TX
  [40676.545099] __ratelimit: 246 callbacks suppressed
  [40676.545103] abcdef[23675]: segfault at 0 ...

as you can see the ratelimit message contains a function prefix.
Since this is always __ratelimit, this wont help much.

This patch changes __ratelimit and printk_ratelimit to print the
function name that calls ratelimit.

This will pinpoint the responsible function, as long as not several
different places call ratelimit with the same ratelimit state at
the same time. In that case we catch only one random function that
calls ratelimit after the wait period.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Young <hidave.darkstar@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
CC: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <200910231458.11832.borntraeger@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h    | 3 ++-
 include/linux/ratelimit.h | 3 ++-
 kernel/printk.c           | 6 +++---
 lib/ratelimit.c           | 6 +++---
 4 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3305f33201be..21d0d822c716 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -240,7 +240,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 asmlinkage int printk(const char * fmt, ...)
 	__attribute__ ((format (printf, 1, 2))) __cold;
 
-extern int printk_ratelimit(void);
+extern int __printk_ratelimit(const char *func);
+#define printk_ratelimit() __printk_ratelimit(__func__)
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
 
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 187bc16c1f15..668cf1bef030 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -25,6 +25,7 @@ struct ratelimit_state {
 		.burst		= burst_init,				\
 	}
 
-extern int __ratelimit(struct ratelimit_state *rs);
+extern int ___ratelimit(struct ratelimit_state *rs, const char *func);
+#define __ratelimit(state) ___ratelimit(state, __func__)
 
 #endif /* _LINUX_RATELIMIT_H */
diff --git a/kernel/printk.c b/kernel/printk.c
index b997c893cdcf..8283dbe15af4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1364,11 +1364,11 @@ late_initcall(disable_boot_consoles);
  */
 DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
 
-int printk_ratelimit(void)
+int __printk_ratelimit(const char *func)
 {
-	return __ratelimit(&printk_ratelimit_state);
+	return ___ratelimit(&printk_ratelimit_state, func);
 }
-EXPORT_SYMBOL(printk_ratelimit);
+EXPORT_SYMBOL(__printk_ratelimit);
 
 /**
  * printk_timed_ratelimit - caller-controlled printk ratelimiting
diff --git a/lib/ratelimit.c b/lib/ratelimit.c
index 5551731ae1d4..09f5ce1810dc 100644
--- a/lib/ratelimit.c
+++ b/lib/ratelimit.c
@@ -20,7 +20,7 @@
  * This enforces a rate limit: not more than @rs->ratelimit_burst callbacks
  * in every @rs->ratelimit_jiffies
  */
-int __ratelimit(struct ratelimit_state *rs)
+int ___ratelimit(struct ratelimit_state *rs, const char *func)
 {
 	unsigned long flags;
 	int ret;
@@ -43,7 +43,7 @@ int __ratelimit(struct ratelimit_state *rs)
 	if (time_is_before_jiffies(rs->begin + rs->interval)) {
 		if (rs->missed)
 			printk(KERN_WARNING "%s: %d callbacks suppressed\n",
-				__func__, rs->missed);
+				func, rs->missed);
 		rs->begin   = 0;
 		rs->printed = 0;
 		rs->missed  = 0;
@@ -59,4 +59,4 @@ int __ratelimit(struct ratelimit_state *rs)
 
 	return ret;
 }
-EXPORT_SYMBOL(__ratelimit);
+EXPORT_SYMBOL(___ratelimit);
-- 
cgit v1.2.3


From bb015f0c85362aa767f8f00f50a40d85e489414f Mon Sep 17 00:00:00 2001
From: Wolfram Sang <w.sang@pengutronix.de>
Date: Mon, 19 Oct 2009 11:43:32 +0200
Subject: pcmcia: drop already defined PCI_IDs

Out of 10 PCI_IDs found in the PCMCIA subsystem, only two were not defined in
pci_ids.h. Move them and drop the duplicates. Successfully build-tested.

Signed-off-by: Wolfram Sang <w.sang@pengutronix.de>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 drivers/pcmcia/cirrus.h  | 10 ----------
 drivers/pcmcia/o2micro.h | 22 ----------------------
 include/linux/pci_ids.h  |  2 ++
 3 files changed, 2 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pcmcia/cirrus.h b/drivers/pcmcia/cirrus.h
index ecd4fc7f666f..446a4576e73e 100644
--- a/drivers/pcmcia/cirrus.h
+++ b/drivers/pcmcia/cirrus.h
@@ -30,16 +30,6 @@
 #ifndef _LINUX_CIRRUS_H
 #define _LINUX_CIRRUS_H
 
-#ifndef PCI_VENDOR_ID_CIRRUS
-#define PCI_VENDOR_ID_CIRRUS		0x1013
-#endif
-#ifndef PCI_DEVICE_ID_CIRRUS_6729
-#define PCI_DEVICE_ID_CIRRUS_6729	0x1100
-#endif
-#ifndef PCI_DEVICE_ID_CIRRUS_6832
-#define PCI_DEVICE_ID_CIRRUS_6832	0x1110
-#endif
-
 #define PD67_MISC_CTL_1		0x16	/* Misc control 1 */
 #define PD67_FIFO_CTL		0x17	/* FIFO control */
 #define PD67_MISC_CTL_2		0x1E	/* Misc control 2 */
diff --git a/drivers/pcmcia/o2micro.h b/drivers/pcmcia/o2micro.h
index 72188c462c9c..624442fc0d35 100644
--- a/drivers/pcmcia/o2micro.h
+++ b/drivers/pcmcia/o2micro.h
@@ -30,28 +30,6 @@
 #ifndef _LINUX_O2MICRO_H
 #define _LINUX_O2MICRO_H
 
-#ifndef PCI_VENDOR_ID_O2
-#define PCI_VENDOR_ID_O2		0x1217
-#endif
-#ifndef PCI_DEVICE_ID_O2_6729
-#define PCI_DEVICE_ID_O2_6729		0x6729
-#endif
-#ifndef PCI_DEVICE_ID_O2_6730
-#define PCI_DEVICE_ID_O2_6730		0x673a
-#endif
-#ifndef PCI_DEVICE_ID_O2_6832
-#define PCI_DEVICE_ID_O2_6832		0x6832
-#endif
-#ifndef PCI_DEVICE_ID_O2_6836
-#define PCI_DEVICE_ID_O2_6836		0x6836
-#endif
-#ifndef PCI_DEVICE_ID_O2_6812
-#define PCI_DEVICE_ID_O2_6812		0x6872
-#endif
-#ifndef PCI_DEVICE_ID_O2_6933
-#define PCI_DEVICE_ID_O2_6933           0x6933
-#endif
-
 /* Additional PCI configuration registers */
 
 #define O2_MUX_CONTROL		0x90	/* 32 bit */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index f490e7a7307a..857cc349bf71 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1632,6 +1632,8 @@
 #define PCI_DEVICE_ID_O2_6730		0x673a
 #define PCI_DEVICE_ID_O2_6832		0x6832
 #define PCI_DEVICE_ID_O2_6836		0x6836
+#define PCI_DEVICE_ID_O2_6812		0x6872
+#define PCI_DEVICE_ID_O2_6933		0x6933
 
 #define PCI_VENDOR_ID_3DFX		0x121a
 #define PCI_DEVICE_ID_3DFX_VOODOO	0x0001
-- 
cgit v1.2.3


From ce0e7b28fb75cb003cfc8d0238613aaf1c55e797 Mon Sep 17 00:00:00 2001
From: Ryota Ozaki <ozaki.ryota@gmail.com>
Date: Sat, 24 Oct 2009 01:20:10 +0900
Subject: sched, cpuacct: Fix niced guest time accounting

CPU time of a guest is always accounted in 'user' time
without concern for the nice value of its counterpart
process although the guest is scheduled under the nice
value.

This patch fixes the defect and accounts cpu time of
a niced guest in 'nice' time as same as a niced process.

And also the patch adds 'guest_nice' to cpuacct. The
value provides niced guest cpu time which is like 'nice'
to 'user'.

The original discussions can be found here:

  http://www.mail-archive.com/kvm@vger.kernel.org/msg23982.html
  http://www.mail-archive.com/kvm@vger.kernel.org/msg23860.html

Signed-off-by: Ryota Ozaki <ozaki.ryota@gmail.com>
Acked-by: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1256314810-7897-1-git-send-email-ozaki.ryota@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/filesystems/proc.txt |  3 ++-
 fs/proc/stat.c                     | 19 +++++++++++++------
 include/linux/kernel_stat.h        |  1 +
 kernel/sched.c                     |  9 +++++++--
 4 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2c48f945546b..4af0018533f2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1072,7 +1072,8 @@ second).  The meanings of the columns are as follows, from left to right:
 - irq: servicing interrupts
 - softirq: servicing softirqs
 - steal: involuntary wait
-- guest: running a guest
+- guest: running a normal guest
+- guest_nice: running a niced guest
 
 The "intr" line gives counts of interrupts  serviced since boot time, for each
 of the  possible system interrupts.   The first  column  is the  total of  all
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70a..b9b7aad2003d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
 	int i, j;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest;
+	cputime64_t guest, guest_nice;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
-	guest = cputime64_zero;
+	guest = guest_nice = cputime64_zero;
 	getboottime(&boottime);
 	jif = boottime.tv_sec;
 
@@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+		guest_nice = cputime64_add(guest_nice,
+			kstat_cpu(i).cpustat.guest_nice);
 		for_each_irq_nr(j) {
 			sum += kstat_irqs_cpu(j, i);
 		}
@@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	sum += arch_irq_stat();
 
-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+		"%llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
 		(unsigned long long)cputime64_to_clock_t(nice),
 		(unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(irq),
 		(unsigned long long)cputime64_to_clock_t(softirq),
 		(unsigned long long)cputime64_to_clock_t(steal),
-		(unsigned long long)cputime64_to_clock_t(guest));
+		(unsigned long long)cputime64_to_clock_t(guest),
+		(unsigned long long)cputime64_to_clock_t(guest_nice));
 	for_each_online_cpu(i) {
 
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = kstat_cpu(i).cpustat.softirq;
 		steal = kstat_cpu(i).cpustat.steal;
 		guest = kstat_cpu(i).cpustat.guest;
+		guest_nice = kstat_cpu(i).cpustat.guest_nice;
 		seq_printf(p,
-			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+			"%llu\n",
 			i,
 			(unsigned long long)cputime64_to_clock_t(user),
 			(unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v)
 			(unsigned long long)cputime64_to_clock_t(irq),
 			(unsigned long long)cputime64_to_clock_t(softirq),
 			(unsigned long long)cputime64_to_clock_t(steal),
-			(unsigned long long)cputime64_to_clock_t(guest));
+			(unsigned long long)cputime64_to_clock_t(guest),
+			(unsigned long long)cputime64_to_clock_t(guest_nice));
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 348fa8874b52..c059044bc6dc 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -25,6 +25,7 @@ struct cpu_usage_stat {
 	cputime64_t iowait;
 	cputime64_t steal;
 	cputime64_t guest;
+	cputime64_t guest_nice;
 };
 
 struct kernel_stat {
diff --git a/kernel/sched.c b/kernel/sched.c
index e5205811c19e..67be4d0dddaa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5017,8 +5017,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime = cputime_add(p->gtime, cputime);
 
 	/* Add guest time to cpustat. */
-	cpustat->user = cputime64_add(cpustat->user, tmp);
-	cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	if (TASK_NICE(p) > 0) {
+		cpustat->nice = cputime64_add(cpustat->nice, tmp);
+		cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+	} else {
+		cpustat->user = cputime64_add(cpustat->user, tmp);
+		cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 0b83ddebc6e884dc0221358cf68c461520fbdd8e Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Date: Thu, 22 Oct 2009 13:26:45 +0300
Subject: MFD: twl4030: add twl4030_codec MFD as a new child to the core

New MFD child to twl4030 MFD device.

Reason for the twl4030_codec MFD: the vibra control is actually in the codec
part of the twl4030. If both the vibra and the audio functionality is needed
from the twl4030 at the same time, than they need to control the codec power
and APLL at the same time without breaking the other driver.
Also these two has to be able to work without the need for the other driver.

This MFD device will be used by the drivers, which needs resources
from the twl4030 codec like audio and vibra.

The platform specific configuration data is passed along to the
child drivers (audio, vibra).

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Acked-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 drivers/mfd/Kconfig               |   6 +
 drivers/mfd/Makefile              |   1 +
 drivers/mfd/twl4030-codec.c       | 241 +++++++++++++++++++++++++++++++++
 drivers/mfd/twl4030-core.c        |  14 ++
 include/linux/i2c/twl4030.h       |  18 +++
 include/linux/mfd/twl4030-codec.h | 271 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 551 insertions(+)
 create mode 100644 drivers/mfd/twl4030-codec.c
 create mode 100644 include/linux/mfd/twl4030-codec.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 570be139f9df..08f2d07bf56a 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -121,6 +121,12 @@ config TWL4030_POWER
 	  and load scripts controling which resources are switched off/on
 	  or reset when a sleep, wakeup or warm reset event occurs.
 
+config TWL4030_CODEC
+	bool
+	depends on TWL4030_CORE
+	select MFD_CORE
+	default n
+
 config MFD_TMIO
 	bool
 	default n
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index f3b277b90d40..af0fc903cec8 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_MENELAUS)		+= menelaus.o
 
 obj-$(CONFIG_TWL4030_CORE)	+= twl4030-core.o twl4030-irq.o
 obj-$(CONFIG_TWL4030_POWER)    += twl4030-power.o
+obj-$(CONFIG_TWL4030_CODEC)	+= twl4030-codec.o
 
 obj-$(CONFIG_MFD_MC13783)	+= mc13783-core.o
 
diff --git a/drivers/mfd/twl4030-codec.c b/drivers/mfd/twl4030-codec.c
new file mode 100644
index 000000000000..97103078dc2a
--- /dev/null
+++ b/drivers/mfd/twl4030-codec.c
@@ -0,0 +1,241 @@
+/*
+ * MFD driver for twl4030 codec submodule
+ *
+ * Author:	Peter Ujfalusi <peter.ujfalusi@nokia.com>
+ *
+ * Copyright:   (C) 2009 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/platform_device.h>
+#include <linux/i2c/twl4030.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/twl4030-codec.h>
+
+#define TWL4030_CODEC_CELLS	2
+
+static struct platform_device *twl4030_codec_dev;
+
+struct twl4030_codec_resource {
+	int request_count;
+	u8 reg;
+	u8 mask;
+};
+
+struct twl4030_codec {
+	struct mutex mutex;
+	struct twl4030_codec_resource resource[TWL4030_CODEC_RES_MAX];
+	struct mfd_cell cells[TWL4030_CODEC_CELLS];
+};
+
+/*
+ * Modify the resource, the function returns the content of the register
+ * after the modification.
+ */
+static int twl4030_codec_set_resource(enum twl4030_codec_res id, int enable)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	u8 val;
+
+	twl4030_i2c_read_u8(TWL4030_MODULE_AUDIO_VOICE, &val,
+			codec->resource[id].reg);
+
+	if (enable)
+		val |= codec->resource[id].mask;
+	else
+		val &= ~codec->resource[id].mask;
+
+	twl4030_i2c_write_u8(TWL4030_MODULE_AUDIO_VOICE,
+					val, codec->resource[id].reg);
+
+	return val;
+}
+
+static inline int twl4030_codec_get_resource(enum twl4030_codec_res id)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	u8 val;
+
+	twl4030_i2c_read_u8(TWL4030_MODULE_AUDIO_VOICE, &val,
+			codec->resource[id].reg);
+
+	return val;
+}
+
+/*
+ * Enable the resource.
+ * The function returns with error or the content of the register
+ */
+int twl4030_codec_enable_resource(enum twl4030_codec_res id)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	int val;
+
+	if (id >= TWL4030_CODEC_RES_MAX) {
+		dev_err(&twl4030_codec_dev->dev,
+				"Invalid resource ID (%u)\n", id);
+		return -EINVAL;
+	}
+
+	mutex_lock(&codec->mutex);
+	if (!codec->resource[id].request_count)
+		/* Resource was disabled, enable it */
+		val = twl4030_codec_set_resource(id, 1);
+	else
+		val = twl4030_codec_get_resource(id);
+
+	codec->resource[id].request_count++;
+	mutex_unlock(&codec->mutex);
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(twl4030_codec_enable_resource);
+
+/*
+ * Disable the resource.
+ * The function returns with error or the content of the register
+ */
+int twl4030_codec_disable_resource(unsigned id)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	int val;
+
+	if (id >= TWL4030_CODEC_RES_MAX) {
+		dev_err(&twl4030_codec_dev->dev,
+				"Invalid resource ID (%u)\n", id);
+		return -EINVAL;
+	}
+
+	mutex_lock(&codec->mutex);
+	if (!codec->resource[id].request_count) {
+		dev_err(&twl4030_codec_dev->dev,
+			"Resource has been disabled already (%u)\n", id);
+		mutex_unlock(&codec->mutex);
+		return -EPERM;
+	}
+	codec->resource[id].request_count--;
+
+	if (!codec->resource[id].request_count)
+		/* Resource can be disabled now */
+		val = twl4030_codec_set_resource(id, 0);
+	else
+		val = twl4030_codec_get_resource(id);
+
+	mutex_unlock(&codec->mutex);
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(twl4030_codec_disable_resource);
+
+static int __devinit twl4030_codec_probe(struct platform_device *pdev)
+{
+	struct twl4030_codec *codec;
+	struct twl4030_codec_data *pdata = pdev->dev.platform_data;
+	struct mfd_cell *cell = NULL;
+	int ret, childs = 0;
+
+	codec = kzalloc(sizeof(struct twl4030_codec), GFP_KERNEL);
+	if (!codec)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, codec);
+
+	twl4030_codec_dev = pdev;
+	mutex_init(&codec->mutex);
+
+	/* Codec power */
+	codec->resource[TWL4030_CODEC_RES_POWER].reg = TWL4030_REG_CODEC_MODE;
+	codec->resource[TWL4030_CODEC_RES_POWER].mask = TWL4030_CODECPDZ;
+
+	/* PLL */
+	codec->resource[TWL4030_CODEC_RES_APLL].reg = TWL4030_REG_APLL_CTL;
+	codec->resource[TWL4030_CODEC_RES_APLL].mask = TWL4030_APLL_EN;
+
+	if (pdata->audio) {
+		cell = &codec->cells[childs];
+		cell->name = "twl4030_codec_audio";
+		cell->platform_data = pdata->audio;
+		cell->data_size = sizeof(*pdata->audio);
+		childs++;
+	}
+	if (pdata->vibra) {
+		cell = &codec->cells[childs];
+		cell->name = "twl4030_codec_vibra";
+		cell->platform_data = pdata->vibra;
+		cell->data_size = sizeof(*pdata->vibra);
+		childs++;
+	}
+
+	if (childs)
+		ret = mfd_add_devices(&pdev->dev, pdev->id, codec->cells,
+				      childs, NULL, 0);
+	else {
+		dev_err(&pdev->dev, "No platform data found for childs\n");
+		ret = -ENODEV;
+	}
+
+	if (!ret)
+		return 0;
+
+	platform_set_drvdata(pdev, NULL);
+	kfree(codec);
+	twl4030_codec_dev = NULL;
+	return ret;
+}
+
+static int __devexit twl4030_codec_remove(struct platform_device *pdev)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(pdev);
+
+	mfd_remove_devices(&pdev->dev);
+	platform_set_drvdata(pdev, NULL);
+	kfree(codec);
+	twl4030_codec_dev = NULL;
+
+	return 0;
+}
+
+MODULE_ALIAS("platform:twl4030_codec");
+
+static struct platform_driver twl4030_codec_driver = {
+	.probe		= twl4030_codec_probe,
+	.remove		= __devexit_p(twl4030_codec_remove),
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= "twl4030_codec",
+	},
+};
+
+static int __devinit twl4030_codec_init(void)
+{
+	return platform_driver_register(&twl4030_codec_driver);
+}
+module_init(twl4030_codec_init);
+
+static void __devexit twl4030_codec_exit(void)
+{
+	platform_driver_unregister(&twl4030_codec_driver);
+}
+module_exit(twl4030_codec_exit);
+
+MODULE_AUTHOR("Peter Ujfalusi <peter.ujfalusi@nokia.com>");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c
index e424cf6d8e9e..0ee81e46bfbd 100644
--- a/drivers/mfd/twl4030-core.c
+++ b/drivers/mfd/twl4030-core.c
@@ -114,6 +114,12 @@
 #define twl_has_watchdog()        false
 #endif
 
+#if defined(CONFIG_TWL4030_CODEC) || defined(CONFIG_TWL4030_CODEC_MODULE)
+#define twl_has_codec()	true
+#else
+#define twl_has_codec()	false
+#endif
+
 /* Triton Core internal information (BEGIN) */
 
 /* Last - for index max*/
@@ -557,6 +563,14 @@ add_children(struct twl4030_platform_data *pdata, unsigned long features)
 			return PTR_ERR(child);
 	}
 
+	if (twl_has_codec() && pdata->codec) {
+		child = add_child(1, "twl4030_codec",
+				pdata->codec, sizeof(*pdata->codec),
+				false, 0, 0);
+		if (IS_ERR(child))
+			return PTR_ERR(child);
+	}
+
 	if (twl_has_regulator()) {
 		/*
 		child = add_regulator(TWL4030_REG_VPLL1, pdata->vpll1);
diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h
index 2d02dfd7076c..42d6c722bd82 100644
--- a/include/linux/i2c/twl4030.h
+++ b/include/linux/i2c/twl4030.h
@@ -401,6 +401,23 @@ struct twl4030_power_data {
 
 extern void twl4030_power_init(struct twl4030_power_data *triton2_scripts);
 
+struct twl4030_codec_audio_data {
+	unsigned int	audio_mclk;
+	unsigned int ramp_delay_value;
+	unsigned int hs_extmute:1;
+	void (*set_hs_extmute)(int mute);
+};
+
+struct twl4030_codec_vibra_data {
+	unsigned int	audio_mclk;
+	unsigned int	coexist;
+};
+
+struct twl4030_codec_data {
+	struct twl4030_codec_audio_data		*audio;
+	struct twl4030_codec_vibra_data		*vibra;
+};
+
 struct twl4030_platform_data {
 	unsigned				irq_base, irq_end;
 	struct twl4030_bci_platform_data	*bci;
@@ -409,6 +426,7 @@ struct twl4030_platform_data {
 	struct twl4030_keypad_data		*keypad;
 	struct twl4030_usb_data			*usb;
 	struct twl4030_power_data		*power;
+	struct twl4030_codec_data		*codec;
 
 	/* LDO regulators */
 	struct regulator_init_data		*vdac;
diff --git a/include/linux/mfd/twl4030-codec.h b/include/linux/mfd/twl4030-codec.h
new file mode 100644
index 000000000000..ef0a3041d759
--- /dev/null
+++ b/include/linux/mfd/twl4030-codec.h
@@ -0,0 +1,271 @@
+/*
+ * MFD driver for twl4030 codec submodule
+ *
+ * Author:	Peter Ujfalusi <peter.ujfalusi@nokia.com>
+ *
+ * Copyright:   (C) 2009 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#ifndef __TWL4030_CODEC_H__
+#define __TWL4030_CODEC_H__
+
+/* Codec registers */
+#define TWL4030_REG_CODEC_MODE		0x01
+#define TWL4030_REG_OPTION		0x02
+#define TWL4030_REG_UNKNOWN		0x03
+#define TWL4030_REG_MICBIAS_CTL		0x04
+#define TWL4030_REG_ANAMICL		0x05
+#define TWL4030_REG_ANAMICR		0x06
+#define TWL4030_REG_AVADC_CTL		0x07
+#define TWL4030_REG_ADCMICSEL		0x08
+#define TWL4030_REG_DIGMIXING		0x09
+#define TWL4030_REG_ATXL1PGA		0x0A
+#define TWL4030_REG_ATXR1PGA		0x0B
+#define TWL4030_REG_AVTXL2PGA		0x0C
+#define TWL4030_REG_AVTXR2PGA		0x0D
+#define TWL4030_REG_AUDIO_IF		0x0E
+#define TWL4030_REG_VOICE_IF		0x0F
+#define TWL4030_REG_ARXR1PGA		0x10
+#define TWL4030_REG_ARXL1PGA		0x11
+#define TWL4030_REG_ARXR2PGA		0x12
+#define TWL4030_REG_ARXL2PGA		0x13
+#define TWL4030_REG_VRXPGA		0x14
+#define TWL4030_REG_VSTPGA		0x15
+#define TWL4030_REG_VRX2ARXPGA		0x16
+#define TWL4030_REG_AVDAC_CTL		0x17
+#define TWL4030_REG_ARX2VTXPGA		0x18
+#define TWL4030_REG_ARXL1_APGA_CTL	0x19
+#define TWL4030_REG_ARXR1_APGA_CTL	0x1A
+#define TWL4030_REG_ARXL2_APGA_CTL	0x1B
+#define TWL4030_REG_ARXR2_APGA_CTL	0x1C
+#define TWL4030_REG_ATX2ARXPGA		0x1D
+#define TWL4030_REG_BT_IF		0x1E
+#define TWL4030_REG_BTPGA		0x1F
+#define TWL4030_REG_BTSTPGA		0x20
+#define TWL4030_REG_EAR_CTL		0x21
+#define TWL4030_REG_HS_SEL		0x22
+#define TWL4030_REG_HS_GAIN_SET		0x23
+#define TWL4030_REG_HS_POPN_SET		0x24
+#define TWL4030_REG_PREDL_CTL		0x25
+#define TWL4030_REG_PREDR_CTL		0x26
+#define TWL4030_REG_PRECKL_CTL		0x27
+#define TWL4030_REG_PRECKR_CTL		0x28
+#define TWL4030_REG_HFL_CTL		0x29
+#define TWL4030_REG_HFR_CTL		0x2A
+#define TWL4030_REG_ALC_CTL		0x2B
+#define TWL4030_REG_ALC_SET1		0x2C
+#define TWL4030_REG_ALC_SET2		0x2D
+#define TWL4030_REG_BOOST_CTL		0x2E
+#define TWL4030_REG_SOFTVOL_CTL		0x2F
+#define TWL4030_REG_DTMF_FREQSEL	0x30
+#define TWL4030_REG_DTMF_TONEXT1H	0x31
+#define TWL4030_REG_DTMF_TONEXT1L	0x32
+#define TWL4030_REG_DTMF_TONEXT2H	0x33
+#define TWL4030_REG_DTMF_TONEXT2L	0x34
+#define TWL4030_REG_DTMF_TONOFF		0x35
+#define TWL4030_REG_DTMF_WANONOFF	0x36
+#define TWL4030_REG_I2S_RX_SCRAMBLE_H	0x37
+#define TWL4030_REG_I2S_RX_SCRAMBLE_M	0x38
+#define TWL4030_REG_I2S_RX_SCRAMBLE_L	0x39
+#define TWL4030_REG_APLL_CTL		0x3A
+#define TWL4030_REG_DTMF_CTL		0x3B
+#define TWL4030_REG_DTMF_PGA_CTL2	0x3C
+#define TWL4030_REG_DTMF_PGA_CTL1	0x3D
+#define TWL4030_REG_MISC_SET_1		0x3E
+#define TWL4030_REG_PCMBTMUX		0x3F
+#define TWL4030_REG_RX_PATH_SEL		0x43
+#define TWL4030_REG_VDL_APGA_CTL	0x44
+#define TWL4030_REG_VIBRA_CTL		0x45
+#define TWL4030_REG_VIBRA_SET		0x46
+#define TWL4030_REG_VIBRA_PWM_SET	0x47
+#define TWL4030_REG_ANAMIC_GAIN		0x48
+#define TWL4030_REG_MISC_SET_2		0x49
+
+/* Bitfield Definitions */
+
+/* TWL4030_CODEC_MODE (0x01) Fields */
+#define TWL4030_APLL_RATE		0xF0
+#define TWL4030_APLL_RATE_8000		0x00
+#define TWL4030_APLL_RATE_11025		0x10
+#define TWL4030_APLL_RATE_12000		0x20
+#define TWL4030_APLL_RATE_16000		0x40
+#define TWL4030_APLL_RATE_22050		0x50
+#define TWL4030_APLL_RATE_24000		0x60
+#define TWL4030_APLL_RATE_32000		0x80
+#define TWL4030_APLL_RATE_44100		0x90
+#define TWL4030_APLL_RATE_48000		0xA0
+#define TWL4030_APLL_RATE_96000		0xE0
+#define TWL4030_SEL_16K			0x08
+#define TWL4030_CODECPDZ		0x02
+#define TWL4030_OPT_MODE		0x01
+#define TWL4030_OPTION_1		(1 << 0)
+#define TWL4030_OPTION_2		(0 << 0)
+
+/* TWL4030_OPTION (0x02) Fields */
+#define TWL4030_ATXL1_EN		(1 << 0)
+#define TWL4030_ATXR1_EN		(1 << 1)
+#define TWL4030_ATXL2_VTXL_EN		(1 << 2)
+#define TWL4030_ATXR2_VTXR_EN		(1 << 3)
+#define TWL4030_ARXL1_VRX_EN		(1 << 4)
+#define TWL4030_ARXR1_EN		(1 << 5)
+#define TWL4030_ARXL2_EN		(1 << 6)
+#define TWL4030_ARXR2_EN		(1 << 7)
+
+/* TWL4030_REG_MICBIAS_CTL (0x04) Fields */
+#define TWL4030_MICBIAS2_CTL		0x40
+#define TWL4030_MICBIAS1_CTL		0x20
+#define TWL4030_HSMICBIAS_EN		0x04
+#define TWL4030_MICBIAS2_EN		0x02
+#define TWL4030_MICBIAS1_EN		0x01
+
+/* ANAMICL (0x05) Fields */
+#define TWL4030_CNCL_OFFSET_START	0x80
+#define TWL4030_OFFSET_CNCL_SEL		0x60
+#define TWL4030_OFFSET_CNCL_SEL_ARX1	0x00
+#define TWL4030_OFFSET_CNCL_SEL_ARX2	0x20
+#define TWL4030_OFFSET_CNCL_SEL_VRX	0x40
+#define TWL4030_OFFSET_CNCL_SEL_ALL	0x60
+#define TWL4030_MICAMPL_EN		0x10
+#define TWL4030_CKMIC_EN		0x08
+#define TWL4030_AUXL_EN			0x04
+#define TWL4030_HSMIC_EN		0x02
+#define TWL4030_MAINMIC_EN		0x01
+
+/* ANAMICR (0x06) Fields */
+#define TWL4030_MICAMPR_EN		0x10
+#define TWL4030_AUXR_EN			0x04
+#define TWL4030_SUBMIC_EN		0x01
+
+/* AVADC_CTL (0x07) Fields */
+#define TWL4030_ADCL_EN			0x08
+#define TWL4030_AVADC_CLK_PRIORITY	0x04
+#define TWL4030_ADCR_EN			0x02
+
+/* TWL4030_REG_ADCMICSEL (0x08) Fields */
+#define TWL4030_DIGMIC1_EN		0x08
+#define TWL4030_TX2IN_SEL		0x04
+#define TWL4030_DIGMIC0_EN		0x02
+#define TWL4030_TX1IN_SEL		0x01
+
+/* AUDIO_IF (0x0E) Fields */
+#define TWL4030_AIF_SLAVE_EN		0x80
+#define TWL4030_DATA_WIDTH		0x60
+#define TWL4030_DATA_WIDTH_16S_16W	0x00
+#define TWL4030_DATA_WIDTH_32S_16W	0x40
+#define TWL4030_DATA_WIDTH_32S_24W	0x60
+#define TWL4030_AIF_FORMAT		0x18
+#define TWL4030_AIF_FORMAT_CODEC	0x00
+#define TWL4030_AIF_FORMAT_LEFT		0x08
+#define TWL4030_AIF_FORMAT_RIGHT	0x10
+#define TWL4030_AIF_FORMAT_TDM		0x18
+#define TWL4030_AIF_TRI_EN		0x04
+#define TWL4030_CLK256FS_EN		0x02
+#define TWL4030_AIF_EN			0x01
+
+/* VOICE_IF (0x0F) Fields */
+#define TWL4030_VIF_SLAVE_EN		0x80
+#define TWL4030_VIF_DIN_EN		0x40
+#define TWL4030_VIF_DOUT_EN		0x20
+#define TWL4030_VIF_SWAP		0x10
+#define TWL4030_VIF_FORMAT		0x08
+#define TWL4030_VIF_TRI_EN		0x04
+#define TWL4030_VIF_SUB_EN		0x02
+#define TWL4030_VIF_EN			0x01
+
+/* EAR_CTL (0x21) */
+#define TWL4030_EAR_GAIN		0x30
+
+/* HS_GAIN_SET (0x23) Fields */
+#define TWL4030_HSR_GAIN		0x0C
+#define TWL4030_HSR_GAIN_PWR_DOWN	0x00
+#define TWL4030_HSR_GAIN_PLUS_6DB	0x04
+#define TWL4030_HSR_GAIN_0DB		0x08
+#define TWL4030_HSR_GAIN_MINUS_6DB	0x0C
+#define TWL4030_HSL_GAIN		0x03
+#define TWL4030_HSL_GAIN_PWR_DOWN	0x00
+#define TWL4030_HSL_GAIN_PLUS_6DB	0x01
+#define TWL4030_HSL_GAIN_0DB		0x02
+#define TWL4030_HSL_GAIN_MINUS_6DB	0x03
+
+/* HS_POPN_SET (0x24) Fields */
+#define TWL4030_VMID_EN			0x40
+#define	TWL4030_EXTMUTE			0x20
+#define TWL4030_RAMP_DELAY		0x1C
+#define TWL4030_RAMP_DELAY_20MS		0x00
+#define TWL4030_RAMP_DELAY_40MS		0x04
+#define TWL4030_RAMP_DELAY_81MS		0x08
+#define TWL4030_RAMP_DELAY_161MS	0x0C
+#define TWL4030_RAMP_DELAY_323MS	0x10
+#define TWL4030_RAMP_DELAY_645MS	0x14
+#define TWL4030_RAMP_DELAY_1291MS	0x18
+#define TWL4030_RAMP_DELAY_2581MS	0x1C
+#define TWL4030_RAMP_EN			0x02
+
+/* PREDL_CTL (0x25) */
+#define TWL4030_PREDL_GAIN		0x30
+
+/* PREDR_CTL (0x26) */
+#define TWL4030_PREDR_GAIN		0x30
+
+/* PRECKL_CTL (0x27) */
+#define TWL4030_PRECKL_GAIN		0x30
+
+/* PRECKR_CTL (0x28) */
+#define TWL4030_PRECKR_GAIN		0x30
+
+/* HFL_CTL (0x29, 0x2A) Fields */
+#define TWL4030_HF_CTL_HB_EN		0x04
+#define TWL4030_HF_CTL_LOOP_EN		0x08
+#define TWL4030_HF_CTL_RAMP_EN		0x10
+#define TWL4030_HF_CTL_REF_EN		0x20
+
+/* APLL_CTL (0x3A) Fields */
+#define TWL4030_APLL_EN			0x10
+#define TWL4030_APLL_INFREQ		0x0F
+#define TWL4030_APLL_INFREQ_19200KHZ	0x05
+#define TWL4030_APLL_INFREQ_26000KHZ	0x06
+#define TWL4030_APLL_INFREQ_38400KHZ	0x0F
+
+/* REG_MISC_SET_1 (0x3E) Fields */
+#define TWL4030_CLK64_EN		0x80
+#define TWL4030_SCRAMBLE_EN		0x40
+#define TWL4030_FMLOOP_EN		0x20
+#define TWL4030_SMOOTH_ANAVOL_EN	0x02
+#define TWL4030_DIGMIC_LR_SWAP_EN	0x01
+
+/* VIBRA_CTL (0x45) */
+#define TWL4030_VIBRA_EN		0x01
+#define TWL4030_VIBRA_DIR		0x02
+#define TWL4030_VIBRA_AUDIO_SEL_L1	(0x00 << 2)
+#define TWL4030_VIBRA_AUDIO_SEL_R1	(0x01 << 2)
+#define TWL4030_VIBRA_AUDIO_SEL_L2	(0x02 << 2)
+#define TWL4030_VIBRA_AUDIO_SEL_R2	(0x03 << 2)
+#define TWL4030_VIBRA_SEL		0x10
+#define TWL4030_VIBRA_DIR_SEL		0x20
+
+/* TWL4030 codec resource IDs */
+enum twl4030_codec_res {
+	TWL4030_CODEC_RES_POWER = 0,
+	TWL4030_CODEC_RES_APLL,
+	TWL4030_CODEC_RES_MAX,
+};
+
+int twl4030_codec_disable_resource(enum twl4030_codec_res id);
+int twl4030_codec_enable_resource(enum twl4030_codec_res id);
+
+#endif	/* End of __TWL4030_CODEC_H__ */
-- 
cgit v1.2.3


From 9b1d82fa1611706fa7ee1505f290160a18caf95d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 25 Oct 2009 19:03:50 -0700
Subject: rcu: "Tiny RCU", The Bloatwatch Edition

This patch is a version of RCU designed for !SMP provided for a
small-footprint RCU implementation.  In particular, the
implementation of synchronize_rcu() is extremely lightweight and
high performance. It passes rcutorture testing in each of the
four relevant configurations (combinations of NO_HZ and PREEMPT)
on x86.  This saves about 1K bytes compared to old Classic RCU
(which is no longer in mainline), and more than three kilobytes
compared to Hierarchical RCU (updated to 2.6.30):

	CONFIG_TREE_RCU:

	   text	   data	    bss	    dec	    filename
	    183       4       0     187     kernel/rcupdate.o
	   2783     520      36    3339     kernel/rcutree.o
				   3526 Total (vs 4565 for v7)

	CONFIG_TREE_PREEMPT_RCU:

	   text	   data	    bss	    dec	    filename
	    263       4       0     267     kernel/rcupdate.o
	   4594     776      52    5422     kernel/rcutree.o
	   			   5689 Total (6155 for v7)

	CONFIG_TINY_RCU:

	   text	   data	    bss	    dec	    filename
	     96       4       0     100     kernel/rcupdate.o
	    734      24       0     758     kernel/rcutiny.o
	    			    858 Total (vs 848 for v7)

The above is for x86.  Your mileage may vary on other platforms.
Further compression is possible, but is being procrastinated.

Changes from v7 (http://lkml.org/lkml/2009/10/9/388)

o	Apply Lai Jiangshan's review comments (aside from
might_sleep() 	in synchronize_sched(), which is covered by SMP builds).

o	Fix up expedited primitives.

Changes from v6 (http://lkml.org/lkml/2009/9/23/293).

o	Forward ported to put it into the 2.6.33 stream.

o	Added lockdep support.

o	Make lightweight rcu_barrier.

Changes from v5 (http://lkml.org/lkml/2009/6/23/12).

o	Ported to latest pre-2.6.32 merge window kernel.

	- Renamed rcu_qsctr_inc() to rcu_sched_qs().
	- Renamed rcu_bh_qsctr_inc() to rcu_bh_qs().
	- Provided trivial rcu_cpu_notify().
	- Provided trivial exit_rcu().
	- Provided trivial rcu_needs_cpu().
	- Fixed up the rcu_*_enter/exit() functions in linux/hardirq.h.

o	Removed the dependence on EMBEDDED, with a view to making
	TINY_RCU default for !SMP at some time in the future.

o	Added (trivial) support for expedited grace periods.

Changes from v4 (http://lkml.org/lkml/2009/5/2/91) include:

o	Squeeze the size down a bit further by removing the
	->completed field from struct rcu_ctrlblk.

o	This permits synchronize_rcu() to become the empty function.
	Previous concerns about rcutorture were unfounded, as
	rcutorture correctly handles a constant value from
	rcu_batches_completed() and rcu_batches_completed_bh().

Changes from v3 (http://lkml.org/lkml/2009/3/29/221) include:

o	Changed rcu_batches_completed(), rcu_batches_completed_bh()
	rcu_enter_nohz(), rcu_exit_nohz(), rcu_nmi_enter(), and
	rcu_nmi_exit(), to be static inlines, as suggested by David
	Howells.  Doing this saves about 100 bytes from rcutiny.o.
	(The numbers between v3 and this v4 of the patch are not directly
	comparable, since they are against different versions of Linux.)

Changes from v2 (http://lkml.org/lkml/2009/2/3/333) include:

o	Fix whitespace issues.

o	Change short-circuit "||" operator to instead be "+" in order
to 	fix performance bug noted by "kraai" on LWN.

		(http://lwn.net/Articles/324348/)

Changes from v1 (http://lkml.org/lkml/2009/1/13/440) include:

o	This version depends on EMBEDDED as well as !SMP, as suggested
	by Ingo.

o	Updated rcu_needs_cpu() to unconditionally return zero,
	permitting the CPU to enter dynticks-idle mode at any time.
	This works because callbacks can be invoked upon entry to
	dynticks-idle mode.

o	Paul is now OK with this being included, based on a poll at
the 	Kernel Miniconf at linux.conf.au, where about ten people said
	that they cared about saving 900 bytes on single-CPU systems.

o	Applies to both mainline and tip/core/rcu.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: David Howells <dhowells@redhat.com>
Acked-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: avi@redhat.com
Cc: mtosatti@redhat.com
LKML-Reference: <12565226351355-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hardirq.h  |  24 ++++
 include/linux/rcupdate.h |   6 +
 include/linux/rcutiny.h  |  97 ++++++++++++++++
 init/Kconfig             |   9 ++
 kernel/Makefile          |   1 +
 kernel/rcupdate.c        |   4 +
 kernel/rcutiny.c         | 294 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 435 insertions(+)
 create mode 100644 include/linux/rcutiny.h
 create mode 100644 kernel/rcutiny.c

(limited to 'include/linux')

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 6d527ee82b2b..d5b387669dab 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -139,10 +139,34 @@ static inline void account_system_vtime(struct task_struct *tsk)
 #endif
 
 #if defined(CONFIG_NO_HZ)
+#if defined(CONFIG_TINY_RCU)
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
+
+static inline void rcu_irq_enter(void)
+{
+	rcu_exit_nohz();
+}
+
+static inline void rcu_irq_exit(void)
+{
+	rcu_enter_nohz();
+}
+
+static inline void rcu_nmi_enter(void)
+{
+}
+
+static inline void rcu_nmi_exit(void)
+{
+}
+
+#else
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
 extern void rcu_nmi_enter(void);
 extern void rcu_nmi_exit(void);
+#endif
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 3ebd0b7bcb08..6dd71fa48429 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -68,11 +68,17 @@ extern int sched_expedited_torture_stats(char *page);
 /* Internal to kernel */
 extern void rcu_init(void);
 extern void rcu_scheduler_starting(void);
+#ifndef CONFIG_TINY_RCU
 extern int rcu_needs_cpu(int cpu);
+#else
+static inline int rcu_needs_cpu(int cpu) { return 0; }
+#endif
 extern int rcu_scheduler_active;
 
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
+#elif CONFIG_TINY_RCU
+#include <linux/rcutiny.h>
 #else
 #error "Unknown RCU implementation specified to kernel configuration"
 #endif
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
new file mode 100644
index 000000000000..891073c264dc
--- /dev/null
+++ b/include/linux/rcutiny.h
@@ -0,0 +1,97 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ */
+
+#ifndef __LINUX_TINY_H
+#define __LINUX_TINY_H
+
+#include <linux/cache.h>
+
+void rcu_sched_qs(int cpu);
+void rcu_bh_qs(int cpu);
+
+#define __rcu_read_lock()	preempt_disable()
+#define __rcu_read_unlock()	preempt_enable()
+#define __rcu_read_lock_bh()	local_bh_disable()
+#define __rcu_read_unlock_bh()	local_bh_enable()
+#define call_rcu_sched		call_rcu
+
+#define rcu_init_sched()	do { } while (0)
+extern void rcu_check_callbacks(int cpu, int user);
+extern void __rcu_init(void);
+
+/*
+ * Return the number of grace periods.
+ */
+static inline long rcu_batches_completed(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods.
+ */
+static inline long rcu_batches_completed_bh(void)
+{
+	return 0;
+}
+
+extern int rcu_expedited_torture_stats(char *page);
+
+static inline void synchronize_rcu_expedited(void)
+{
+	synchronize_sched();
+}
+
+static inline void synchronize_rcu_bh_expedited(void)
+{
+	synchronize_sched();
+}
+
+struct notifier_block;
+extern int rcu_cpu_notify(struct notifier_block *self,
+			  unsigned long action, void *hcpu);
+
+#ifdef CONFIG_NO_HZ
+
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static inline void rcu_enter_nohz(void)
+{
+}
+
+static inline void rcu_exit_nohz(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+static inline void exit_rcu(void)
+{
+}
+
+#endif /* __LINUX_RCUTINY_H */
diff --git a/init/Kconfig b/init/Kconfig
index 09c5c6431f42..d367bf6bcc34 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -334,6 +334,15 @@ config TREE_PREEMPT_RCU
 	  is also required.  It also scales down nicely to
 	  smaller systems.
 
+config TINY_RCU
+	bool "UP-only small-memory-footprint RCU"
+	depends on !SMP
+	help
+	  This option selects the RCU implementation that is
+	  designed for UP systems from which real-time response
+	  is not required.  This option greatly reduces the
+	  memory footprint of RCU.
+
 endchoice
 
 config RCU_TRACE
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b9..e9a0e6ed25cc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_TINY_RCU) += rcutiny.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 400183346ad2..7625f207f65e 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -67,6 +67,8 @@ void wakeme_after_rcu(struct rcu_head  *head)
 	complete(&rcu->completion);
 }
 
+#ifndef CONFIG_TINY_RCU
+
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
 /**
@@ -157,6 +159,8 @@ void synchronize_rcu_bh(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
+#endif /* #ifndef CONFIG_TINY_RCU */
+
 static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 		unsigned long action, void *hcpu)
 {
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..0b54efde66ac
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,294 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
+	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
+	struct rcu_head **curtail;	/* ->next pointer of last CB. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.rcucblist = NULL,
+	.donetail = &rcu_ctrlblk.rcucblist,
+	.curtail = &rcu_ctrlblk.rcucblist,
+};
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+	.rcucblist = NULL,
+	.donetail = &rcu_bh_ctrlblk.rcucblist,
+	.curtail = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_NO_HZ
+
+static long rcu_dynticks_nesting = 1;
+
+/*
+ * Enter dynticks-idle mode, which is an extended quiescent state
+ * if we have fully entered that mode (i.e., if the new value of
+ * dynticks_nesting is zero).
+ */
+void rcu_enter_nohz(void)
+{
+	if (--rcu_dynticks_nesting == 0)
+		rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+}
+
+/*
+ * Exit dynticks-idle mode, so that we are no longer in an extended
+ * quiescent state.
+ */
+void rcu_exit_nohz(void)
+{
+	rcu_dynticks_nesting++;
+}
+
+#endif /* #ifdef CONFIG_NO_HZ */
+
+/*
+ * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
+ * Also disable irqs to avoid confusion due to interrupt handlers invoking
+ * call_rcu().
+ */
+static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (rcp->rcucblist != NULL &&
+	    rcp->donetail != rcp->curtail) {
+		rcp->donetail = rcp->curtail;
+		local_irq_restore(flags);
+		return 1;
+	}
+	local_irq_restore(flags);
+	return 0;
+}
+
+/*
+ * Record an rcu quiescent state.  And an rcu_bh quiescent state while we
+ * are at it, given that any rcu quiescent state is also an rcu_bh
+ * quiescent state.  Use "+" instead of "||" to defeat short circuiting.
+ */
+void rcu_sched_qs(int cpu)
+{
+	if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Record an rcu_bh quiescent state.
+ */
+void rcu_bh_qs(int cpu)
+{
+	if (rcu_qsctr_help(&rcu_bh_ctrlblk))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Check to see if the scheduling-clock interrupt came from an extended
+ * quiescent state, and, if so, tell RCU about it.
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) &&
+	     !in_softirq() &&
+	     hardirq_count() <= (1 << HARDIRQ_SHIFT)))
+		rcu_sched_qs(cpu);
+	else if (!in_softirq())
+		rcu_bh_qs(cpu);
+}
+
+/*
+ * Helper function for rcu_process_callbacks() that operates on the
+ * specified rcu_ctrlkblk structure.
+ */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list;
+
+	/* If no RCU callbacks ready to invoke, just return. */
+	if (&rcp->rcucblist == rcp->donetail)
+		return;
+
+	/* Move the ready-to-invoke callbacks to a local list. */
+	local_irq_save(flags);
+	list = rcp->rcucblist;
+	rcp->rcucblist = *rcp->donetail;
+	*rcp->donetail = NULL;
+	if (rcp->curtail == rcp->donetail)
+		rcp->curtail = &rcp->rcucblist;
+	rcp->donetail = &rcp->rcucblist;
+	local_irq_restore(flags);
+
+	/* Invoke the callbacks on the local list. */
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+	}
+}
+
+/*
+ * Invoke any callbacks whose grace period has completed.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	__rcu_process_callbacks(&rcu_ctrlblk);
+	__rcu_process_callbacks(&rcu_bh_ctrlblk);
+}
+
+/*
+ * Null function to handle CPU being onlined.  Longer term, we want to
+ * make TINY_RCU avoid using rcupdate.c, but later...
+ */
+int rcu_cpu_notify(struct notifier_block *self,
+		   unsigned long action, void *hcpu)
+{
+	return NOTIFY_OK;
+}
+
+/*
+ * Wait for a grace period to elapse.  But it is illegal to invoke
+ * synchronize_sched() from within an RCU read-side critical section.
+ * Therefore, any legal call to synchronize_sched() is a quiescent
+ * state, and so on a UP system, synchronize_sched() need do nothing.
+ * Ditto for synchronize_rcu_bh().  (But Lai Jiangshan points out the
+ * benefits of doing might_sleep() to reduce latency.)
+ *
+ * Cool, huh?  (Due to Josh Triplett.)
+ *
+ * But we want to make this a static inline later.
+ */
+void synchronize_sched(void)
+{
+	cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+
+void synchronize_rcu_bh(void)
+{
+	synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+
+/*
+ * Helper function for call_rcu() and call_rcu_bh().
+ */
+static void __call_rcu(struct rcu_head *head,
+		       void (*func)(struct rcu_head *rcu),
+		       struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	*rcp->curtail = head;
+	rcp->curtail = &head->next;
+	local_irq_restore(flags);
+}
+
+/*
+ * Post an RCU callback to be invoked after the end of an RCU grace
+ * period.  But since we have but one CPU, that would be after any
+ * quiescent state.
+ */
+void call_rcu(struct rcu_head *head,
+	      void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_ctrlblk);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Post an RCU bottom-half callback to be invoked after any subsequent
+ * quiescent state.
+ */
+void call_rcu_bh(struct rcu_head *head,
+		 void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_bh_ctrlblk);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+void rcu_barrier(void)
+{
+	struct rcu_synchronize rcu;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+void rcu_barrier_bh(void)
+{
+	struct rcu_synchronize rcu;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_bh(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+
+void rcu_barrier_sched(void)
+{
+	struct rcu_synchronize rcu;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_sched(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+
+void __rcu_init(void)
+{
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
-- 
cgit v1.2.3


From 0cd397d33608ae6c97d2ee6c8c43462b419b7e26 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 25 Oct 2009 19:03:51 -0700
Subject: rcu: Add synchronize_srcu_expedited()

This patch creates a synchronize_srcu_expedited() that uses
synchronize_sched_expedited() where synchronize_srcu()
uses synchronize_sched().  The synchronize_srcu() and
synchronize_srcu_expedited() functions become one-liners that
pass synchronize_sched() or synchronize_sched_expedited(),
repectively, to a new __synchronize_srcu() function.

While in the file, move the EXPORT_SYMBOL_GPL()s to immediately
follow the corresponding functions.

Requested-by: Avi Kivity <avi@redhat.com>
Tested-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: avi@redhat.com
LKML-Reference: <12565226354038-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/srcu.h |  1 +
 kernel/srcu.c        | 74 ++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 52 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index aca0eee53930..4765d97dcafb 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -48,6 +48,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp);
 int srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
 void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
 void synchronize_srcu(struct srcu_struct *sp);
+void synchronize_srcu_expedited(struct srcu_struct *sp);
 long srcu_batches_completed(struct srcu_struct *sp);
 
 #endif
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..818d7d9aa03c 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp)
 	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
 	return (sp->per_cpu_ref ? 0 : -ENOMEM);
 }
+EXPORT_SYMBOL_GPL(init_srcu_struct);
 
 /*
  * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 	free_percpu(sp->per_cpu_ref);
 	sp->per_cpu_ref = NULL;
 }
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
 
 /**
  * srcu_read_lock - register a new reader for an SRCU-protected structure.
@@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp)
 	preempt_enable();
 	return idx;
 }
+EXPORT_SYMBOL_GPL(srcu_read_lock);
 
 /**
  * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
@@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx)
 	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(srcu_read_unlock);
 
-/**
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
- *
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates.  Can block; must be called from
- * process context.
- *
- * Note that it is illegal to call synchornize_srcu() from the corresponding
- * SRCU read-side critical section; doing so will result in deadlock.
- * However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section.
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
  */
-void synchronize_srcu(struct srcu_struct *sp)
+void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
 {
 	int idx;
 
@@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp)
 		return;
 	}
 
-	synchronize_sched();  /* Force memory barrier on all CPUs. */
+	sync_func();  /* Force memory barrier on all CPUs. */
 
 	/*
 	 * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp)
 	idx = sp->completed & 0x1;
 	sp->completed++;
 
-	synchronize_sched();  /* Force memory barrier on all CPUs. */
+	sync_func();  /* Force memory barrier on all CPUs. */
 
 	/*
 	 * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp)
 	while (srcu_readers_active_idx(sp, idx))
 		schedule_timeout_interruptible(1);
 
-	synchronize_sched();  /* Force memory barrier on all CPUs. */
+	sync_func();  /* Force memory barrier on all CPUs. */
 
 	/*
 	 * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -236,6 +229,47 @@ void synchronize_srcu(struct srcu_struct *sp)
 	mutex_unlock(&sp->mutex);
 }
 
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates.  Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchronize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+	__synchronize_srcu(sp, synchronize_sched);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/**
+ * synchronize_srcu_expedited - like synchronize_srcu, but less patient
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates.  Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchronize_srcu_expedited()
+ * from the corresponding SRCU read-side critical section; doing so
+ * will result in deadlock.  However, it is perfectly legal to call
+ * synchronize_srcu_expedited() on one srcu_struct from some other
+ * srcu_struct's read-side critical section.
+ */
+void synchronize_srcu_expedited(struct srcu_struct *sp)
+{
+	__synchronize_srcu(sp, synchronize_sched_expedited);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+
 /**
  * srcu_batches_completed - return batches completed.
  * @sp: srcu_struct on which to report batch completion.
@@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
 {
 	return sp->completed;
 }
-
-EXPORT_SYMBOL_GPL(init_srcu_struct);
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-EXPORT_SYMBOL_GPL(srcu_read_lock);
-EXPORT_SYMBOL_GPL(srcu_read_unlock);
-EXPORT_SYMBOL_GPL(synchronize_srcu);
 EXPORT_SYMBOL_GPL(srcu_batches_completed);
-- 
cgit v1.2.3


From 4ce5b90340879ce93d169b7b523c2cbbe7c45843 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 26 Oct 2009 07:55:55 +0100
Subject: rcu: Do tiny cleanups in rcutiny

No change in functionality - just straighten out a few small
stylistic details.

Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: avi@redhat.com
Cc: mtosatti@redhat.com
LKML-Reference: <12565226351355-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcutiny.h |  6 ++----
 kernel/rcutiny.c        | 49 +++++++++++++++++++++++--------------------------
 2 files changed, 25 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 891073c264dc..2c1fe8373e71 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -20,9 +20,8 @@
  * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
+ *		Documentation/RCU
  */
-
 #ifndef __LINUX_TINY_H
 #define __LINUX_TINY_H
 
@@ -70,8 +69,7 @@ static inline void synchronize_rcu_bh_expedited(void)
 }
 
 struct notifier_block;
-extern int rcu_cpu_notify(struct notifier_block *self,
-			  unsigned long action, void *hcpu);
+extern int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu);
 
 #ifdef CONFIG_NO_HZ
 
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0b54efde66ac..b33ec3aa377a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -20,22 +20,21 @@
  * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
+ *		Documentation/RCU
  */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/completion.h>
 #include <linux/moduleparam.h>
+#include <linux/completion.h>
+#include <linux/interrupt.h>
 #include <linux/notifier.h>
-#include <linux/cpu.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/time.h>
+#include <linux/cpu.h>
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
@@ -46,14 +45,13 @@ struct rcu_ctrlblk {
 
 /* Definition for rcupdate control block. */
 static struct rcu_ctrlblk rcu_ctrlblk = {
-	.rcucblist = NULL,
-	.donetail = &rcu_ctrlblk.rcucblist,
-	.curtail = &rcu_ctrlblk.rcucblist,
+	.donetail	= &rcu_ctrlblk.rcucblist,
+	.curtail	= &rcu_ctrlblk.rcucblist,
 };
+
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.rcucblist = NULL,
-	.donetail = &rcu_bh_ctrlblk.rcucblist,
-	.curtail = &rcu_bh_ctrlblk.rcucblist,
+	.donetail	= &rcu_bh_ctrlblk.rcucblist,
+	.curtail	= &rcu_bh_ctrlblk.rcucblist,
 };
 
 #ifdef CONFIG_NO_HZ
@@ -84,8 +82,8 @@ void rcu_exit_nohz(void)
 
 /*
  * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
- * Also disable irqs to avoid confusion due to interrupt handlers invoking
- * call_rcu().
+ * Also disable irqs to avoid confusion due to interrupt handlers
+ * invoking call_rcu().
  */
 static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
 {
@@ -99,6 +97,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
 		return 1;
 	}
 	local_irq_restore(flags);
+
 	return 0;
 }
 
@@ -143,8 +142,8 @@ void rcu_check_callbacks(int cpu, int user)
  */
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 {
-	unsigned long flags;
 	struct rcu_head *next, *list;
+	unsigned long flags;
 
 	/* If no RCU callbacks ready to invoke, just return. */
 	if (&rcp->rcucblist == rcp->donetail)
@@ -182,8 +181,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  * Null function to handle CPU being onlined.  Longer term, we want to
  * make TINY_RCU avoid using rcupdate.c, but later...
  */
-int rcu_cpu_notify(struct notifier_block *self,
-		   unsigned long action, void *hcpu)
+int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	return NOTIFY_OK;
 }
@@ -223,6 +221,7 @@ static void __call_rcu(struct rcu_head *head,
 
 	head->func = func;
 	head->next = NULL;
+
 	local_irq_save(flags);
 	*rcp->curtail = head;
 	rcp->curtail = &head->next;
@@ -234,8 +233,7 @@ static void __call_rcu(struct rcu_head *head,
  * period.  But since we have but one CPU, that would be after any
  * quiescent state.
  */
-void call_rcu(struct rcu_head *head,
-	      void (*func)(struct rcu_head *rcu))
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
 	__call_rcu(head, func, &rcu_ctrlblk);
 }
@@ -245,8 +243,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * Post an RCU bottom-half callback to be invoked after any subsequent
  * quiescent state.
  */
-void call_rcu_bh(struct rcu_head *head,
-		 void (*func)(struct rcu_head *rcu))
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
 	__call_rcu(head, func, &rcu_bh_ctrlblk);
 }
-- 
cgit v1.2.3


From b2c18e1e08a5a9663094d57bb4be2f02226ee61c Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Fri, 23 Oct 2009 17:14:49 -0400
Subject: cfq: calculate the seek_mean per cfq_queue not per cfq_io_context

async cfq_queue's are already shared between processes within the same
priority, and forthcoming patches will change the mapping of cic to sync
cfq_queue from 1:1 to 1:N.  So, calculate the seekiness of a process
based on the cfq_queue instead of the cfq_io_context.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c       | 68 +++++++++++++++++++++++------------------------
 include/linux/iocontext.h |  5 ----
 2 files changed, 33 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 069a61017c02..78cc8ee5da41 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -112,6 +112,11 @@ struct cfq_queue {
 	unsigned short ioprio, org_ioprio;
 	unsigned short ioprio_class, org_ioprio_class;
 
+	unsigned int seek_samples;
+	u64 seek_total;
+	sector_t seek_mean;
+	sector_t last_request_pos;
+
 	pid_t pid;
 };
 
@@ -962,16 +967,16 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
 		return cfqd->last_position - blk_rq_pos(rq);
 }
 
-#define CIC_SEEK_THR	8 * 1024
-#define CIC_SEEKY(cic)	((cic)->seek_mean > CIC_SEEK_THR)
+#define CFQQ_SEEK_THR		8 * 1024
+#define CFQQ_SEEKY(cfqq)	((cfqq)->seek_mean > CFQQ_SEEK_THR)
 
-static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
+static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+			       struct request *rq)
 {
-	struct cfq_io_context *cic = cfqd->active_cic;
-	sector_t sdist = cic->seek_mean;
+	sector_t sdist = cfqq->seek_mean;
 
-	if (!sample_valid(cic->seek_samples))
-		sdist = CIC_SEEK_THR;
+	if (!sample_valid(cfqq->seek_samples))
+		sdist = CFQQ_SEEK_THR;
 
 	return cfq_dist_from_last(cfqd, rq) <= sdist;
 }
@@ -1000,7 +1005,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
 	 * will contain the closest sector.
 	 */
 	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
-	if (cfq_rq_close(cfqd, __cfqq->next_rq))
+	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
 		return __cfqq;
 
 	if (blk_rq_pos(__cfqq->next_rq) < sector)
@@ -1011,7 +1016,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
 		return NULL;
 
 	__cfqq = rb_entry(node, struct cfq_queue, p_node);
-	if (cfq_rq_close(cfqd, __cfqq->next_rq))
+	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
 		return __cfqq;
 
 	return NULL;
@@ -1033,13 +1038,6 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
 {
 	struct cfq_queue *cfqq;
 
-	/*
-	 * A valid cfq_io_context is necessary to compare requests against
-	 * the seek_mean of the current cfqq.
-	 */
-	if (!cfqd->active_cic)
-		return NULL;
-
 	/*
 	 * We should notice if some of the queues are cooperating, eg
 	 * working closely on the same area of the disk. In that case,
@@ -1110,7 +1108,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	 * seeks. so allow a little bit of time for him to submit a new rq
 	 */
 	sl = cfqd->cfq_slice_idle;
-	if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
+	if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))
 		sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
@@ -1947,33 +1945,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
 }
 
 static void
-cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
+cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		       struct request *rq)
 {
 	sector_t sdist;
 	u64 total;
 
-	if (!cic->last_request_pos)
+	if (!cfqq->last_request_pos)
 		sdist = 0;
-	else if (cic->last_request_pos < blk_rq_pos(rq))
-		sdist = blk_rq_pos(rq) - cic->last_request_pos;
+	else if (cfqq->last_request_pos < blk_rq_pos(rq))
+		sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
 	else
-		sdist = cic->last_request_pos - blk_rq_pos(rq);
+		sdist = cfqq->last_request_pos - blk_rq_pos(rq);
 
 	/*
 	 * Don't allow the seek distance to get too large from the
 	 * odd fragment, pagein, etc
 	 */
-	if (cic->seek_samples <= 60) /* second&third seek */
-		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
+	if (cfqq->seek_samples <= 60) /* second&third seek */
+		sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
 	else
-		sdist = min(sdist, (cic->seek_mean * 4)	+ 2*1024*64);
+		sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
 
-	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
-	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
-	total = cic->seek_total + (cic->seek_samples/2);
-	do_div(total, cic->seek_samples);
-	cic->seek_mean = (sector_t)total;
+	cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
+	cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
+	total = cfqq->seek_total + (cfqq->seek_samples/2);
+	do_div(total, cfqq->seek_samples);
+	cfqq->seek_mean = (sector_t)total;
 }
 
 /*
@@ -1995,11 +1993,11 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 
 	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
-	    (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic)))
+	    (!cfqd->cfq_latency && cfqd->hw_tag && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime_samples)) {
 		unsigned int slice_idle = cfqd->cfq_slice_idle;
-		if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
+		if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))
 			slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
 		if (cic->ttime_mean > slice_idle)
 			enable_idle = 0;
@@ -2066,7 +2064,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	 * if this request is as-good as one we would expect from the
 	 * current cfqq, let it preempt
 	 */
-	if (cfq_rq_close(cfqd, rq))
+	if (cfq_rq_close(cfqd, cfqq, rq))
 		return true;
 
 	return false;
@@ -2108,10 +2106,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqq->meta_pending++;
 
 	cfq_update_io_thinktime(cfqd, cic);
-	cfq_update_io_seektime(cfqd, cic, rq);
+	cfq_update_io_seektime(cfqd, cfqq, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
-	cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 
 	if (cfqq == cfqd->active_queue) {
 		/*
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 4da4a75c3f1e..eb73632440f1 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -40,16 +40,11 @@ struct cfq_io_context {
 	struct io_context *ioc;
 
 	unsigned long last_end_request;
-	sector_t last_request_pos;
 
 	unsigned long ttime_total;
 	unsigned long ttime_samples;
 	unsigned long ttime_mean;
 
-	unsigned int seek_samples;
-	u64 seek_total;
-	sector_t seek_mean;
-
 	struct list_head queue_list;
 	struct hlist_node cic_list;
 
-- 
cgit v1.2.3


From 05423b241311c9380b7280179295bac7794281b6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 26 Oct 2009 18:40:35 -0700
Subject: vlan: allow null VLAN ID to be used

We currently use a 16 bit field (vlan_tci) to store VLAN ID/PRIO on a skb.

Null value is used as a special value, meaning vlan tagging not enabled.
This forbids use of null vlan ID.

As pointed by David, some drivers use the 3 high order bits (PRIO)

As VLAN ID is 12 bits, we can use the remaining bit (CFI) as a flag, and
allow null VLAN ID.

In case future code really wants to use VLAN_CFI_MASK, we'll have to use
a bit outside of vlan_tci.

#define VLAN_PRIO_MASK         0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT        13
#define VLAN_CFI_MASK          0x1000 /* Canonical Format Indicator */
#define VLAN_TAG_PRESENT       VLAN_CFI_MASK
#define VLAN_VID_MASK          0x0fff /* VLAN Identifier */

Reported-by: Gertjan Hofman <gertjan_hofman@yahoo.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 14 +++++++++-----
 net/8021q/vlan.h        |  2 +-
 net/8021q/vlan_dev.c    |  2 +-
 net/core/dev.c          |  2 +-
 net/packet/af_packet.c  |  5 +++--
 5 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 7ff9af1d0f05..8898cbebcf34 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -63,7 +63,11 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb)
 	return (struct vlan_ethhdr *)skb_mac_header(skb);
 }
 
-#define VLAN_VID_MASK	0xfff
+#define VLAN_PRIO_MASK		0xe000 /* Priority Code Point */
+#define VLAN_PRIO_SHIFT		13
+#define VLAN_CFI_MASK		0x1000 /* Canonical Format Indicator */
+#define VLAN_TAG_PRESENT	VLAN_CFI_MASK
+#define VLAN_VID_MASK		0x0fff /* VLAN Identifier */
 
 /* found in socket.c */
 extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *));
@@ -105,8 +109,8 @@ static inline void vlan_group_set_device(struct vlan_group *vg,
 	array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
 }
 
-#define vlan_tx_tag_present(__skb)	((__skb)->vlan_tci)
-#define vlan_tx_tag_get(__skb)		((__skb)->vlan_tci)
+#define vlan_tx_tag_present(__skb)	((__skb)->vlan_tci & VLAN_TAG_PRESENT)
+#define vlan_tx_tag_get(__skb)		((__skb)->vlan_tci & ~VLAN_TAG_PRESENT)
 
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
 extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
@@ -231,7 +235,7 @@ static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
 static inline struct sk_buff *__vlan_hwaccel_put_tag(struct sk_buff *skb,
 						     u16 vlan_tci)
 {
-	skb->vlan_tci = vlan_tci;
+	skb->vlan_tci = VLAN_TAG_PRESENT | vlan_tci;
 	return skb;
 }
 
@@ -284,7 +288,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
 					 u16 *vlan_tci)
 {
 	if (vlan_tx_tag_present(skb)) {
-		*vlan_tci = skb->vlan_tci;
+		*vlan_tci = vlan_tx_tag_get(skb);
 		return 0;
 	} else {
 		*vlan_tci = 0;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 82570bc2a180..4ade5edf1033 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -89,7 +89,7 @@ static inline u32 vlan_get_ingress_priority(struct net_device *dev,
 {
 	struct vlan_dev_info *vip = vlan_dev_info(dev);
 
-	return vip->ingress_priority_map[(vlan_tci >> 13) & 0x7];
+	return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7];
 }
 
 #ifdef CONFIG_VLAN_8021Q_GVRP
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 4198ec5c8abc..e3701977f58d 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -393,7 +393,7 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
 	struct vlan_dev_info *vlan = vlan_dev_info(dev);
 	struct vlan_priority_tci_mapping *mp = NULL;
 	struct vlan_priority_tci_mapping *np;
-	u32 vlan_qos = (vlan_prio << 13) & 0xE000;
+	u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK;
 
 	/* See if a priority mapping exists.. */
 	mp = vlan->egress_priority_map[skb_prio & 0xF];
diff --git a/net/core/dev.c b/net/core/dev.c
index e7bada1d5ed9..950c13fa60d2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2300,7 +2300,7 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);
 
-	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
+	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
 		return NET_RX_SUCCESS;
 
 	/* if we've gotten here through NAPI, check netpoll */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ff752c606413..33e68f20ec61 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -79,6 +79,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
+#include <linux/if_vlan.h>
 
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
@@ -766,7 +767,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			getnstimeofday(&ts);
 		h.h2->tp_sec = ts.tv_sec;
 		h.h2->tp_nsec = ts.tv_nsec;
-		h.h2->tp_vlan_tci = skb->vlan_tci;
+		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
 		hdrlen = sizeof(*h.h2);
 		break;
 	default:
@@ -1493,7 +1494,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		aux.tp_snaplen = skb->len;
 		aux.tp_mac = 0;
 		aux.tp_net = skb_network_offset(skb);
-		aux.tp_vlan_tci = skb->vlan_tci;
+		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
 
 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
 	}
-- 
cgit v1.2.3


From 2c28e2451dba2260e9f88811b29a7787db7e7616 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 26 Oct 2009 13:57:44 -0700
Subject: rcu: Fix TINY_RCU #elif condition

Some compilers are happy with "#elif CONFIG_RCU_TINY", while
others strongly prefer "#elif defined(CONFIG_RCU_TINY)".  Change
to the latter to make more compilers happy.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12565906642768-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 6dd71fa48429..2f1bc42a3b82 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -77,7 +77,7 @@ extern int rcu_scheduler_active;
 
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
-#elif CONFIG_TINY_RCU
+#elif defined(CONFIG_TINY_RCU)
 #include <linux/rcutiny.h>
 #else
 #error "Unknown RCU implementation specified to kernel configuration"
-- 
cgit v1.2.3


From 8b45499ccb8a93cd68b1a8766786c2f8ea991ae2 Mon Sep 17 00:00:00 2001
From: Michael Buesch <mb@bu3sch.de>
Date: Fri, 9 Oct 2009 20:32:10 +0200
Subject: ssb: Put host pointers into a union

This slightly shrinks the structure.

Signed-off-by: Michael Buesch <mb@bu3sch.de>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/ssb/driver_pcicore.c |  4 ++--
 include/linux/ssb/ssb.h      | 20 ++++++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ssb/driver_pcicore.c b/drivers/ssb/driver_pcicore.c
index 538c570df337..f1dcd7969a5c 100644
--- a/drivers/ssb/driver_pcicore.c
+++ b/drivers/ssb/driver_pcicore.c
@@ -551,13 +551,13 @@ int ssb_pcicore_dev_irqvecs_enable(struct ssb_pcicore *pc,
 	might_sleep_if(pdev->id.coreid != SSB_DEV_PCI);
 
 	/* Enable interrupts for this device. */
-	if (bus->host_pci &&
-	    ((pdev->id.revision >= 6) || (pdev->id.coreid == SSB_DEV_PCIE))) {
+	if ((pdev->id.revision >= 6) || (pdev->id.coreid == SSB_DEV_PCIE)) {
 		u32 coremask;
 
 		/* Calculate the "coremask" for the device. */
 		coremask = (1 << dev->core_index);
 
+		SSB_WARN_ON(bus->bustype != SSB_BUSTYPE_PCI);
 		err = pci_read_config_dword(bus->host_pci, SSB_PCI_IRQMASK, &tmp);
 		if (err)
 			goto out;
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index 3d0a9ff24f01..24f988547361 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -269,7 +269,8 @@ struct ssb_bus {
 
 	const struct ssb_bus_ops *ops;
 
-	/* The core in the basic address register window. (PCI bus only) */
+	/* The core currently mapped into the MMIO window.
+	 * Not valid on all host-buses. So don't use outside of SSB. */
 	struct ssb_device *mapped_device;
 	union {
 		/* Currently mapped PCMCIA segment. (bustype == SSB_BUSTYPE_PCMCIA only) */
@@ -281,14 +282,17 @@ struct ssb_bus {
 	 * On PCMCIA-host busses this is used to protect the whole MMIO access. */
 	spinlock_t bar_lock;
 
-	/* The bus this backplane is running on. */
+	/* The host-bus this backplane is running on. */
 	enum ssb_bustype bustype;
-	/* Pointer to the PCI bus (only valid if bustype == SSB_BUSTYPE_PCI). */
-	struct pci_dev *host_pci;
-	/* Pointer to the PCMCIA device (only if bustype == SSB_BUSTYPE_PCMCIA). */
-	struct pcmcia_device *host_pcmcia;
-	/* Pointer to the SDIO device (only if bustype == SSB_BUSTYPE_SDIO). */
-	struct sdio_func *host_sdio;
+	/* Pointers to the host-bus. Check bustype before using any of these pointers. */
+	union {
+		/* Pointer to the PCI bus (only valid if bustype == SSB_BUSTYPE_PCI). */
+		struct pci_dev *host_pci;
+		/* Pointer to the PCMCIA device (only if bustype == SSB_BUSTYPE_PCMCIA). */
+		struct pcmcia_device *host_pcmcia;
+		/* Pointer to the SDIO device (only if bustype == SSB_BUSTYPE_SDIO). */
+		struct sdio_func *host_sdio;
+	};
 
 	/* See enum ssb_quirks */
 	unsigned int quirks;
-- 
cgit v1.2.3


From d6ba452128178091dab7a04d54f7e66fdc32fb39 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Mon, 26 Oct 2009 09:26:18 -0400
Subject: tpm add default function definitions

Add default tpm_pcr_read/extend function definitions required
by IMA/Kconfig changes.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Reviewed-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/tpm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 3338b3f5c21a..8eaa8f83effb 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -31,5 +31,12 @@
 
 extern int tpm_pcr_read(u32 chip_num, int pcr_idx, u8 *res_buf);
 extern int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash);
+#else
+static inline int tpm_pcr_read(u32 chip_num, int pcr_idx, u8 *res_buf) {
+	return -ENODEV;
+}
+static inline int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash) {
+	return -ENODEV;
+}
 #endif
 #endif
-- 
cgit v1.2.3


From f7d7986060b2890fc26db6ab5203efbd33aa2497 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sun, 18 Oct 2009 01:09:29 +0000
Subject: perf_event: Add alignment-faults and emulation-faults software events

Add two more software events that are common to many cpus.

Alignment faults: When a load or store is not aligned properly.

Emulation faults: When an instruction is emulated in software.

Both cause a very significant slowdown (100x or worse), so identifying and
fixing them is very important.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/perf_counter.h   | 2 ++
 include/linux/perf_event.h     | 2 ++
 kernel/perf_event.c            | 2 ++
 tools/perf/design.txt          | 2 ++
 tools/perf/util/parse-events.c | 4 ++++
 5 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7b7fbf433cff..d6b95d1e79f0 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -106,6 +106,8 @@ enum perf_sw_ids {
 	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
 	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
 	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
+	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2e6d95f97419..a33707a3a788 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -102,6 +102,8 @@ enum perf_sw_ids {
 	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
 	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
 	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
+	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9d0b5c665883..0683b33cbb28 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4186,6 +4186,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
 	case PERF_COUNT_SW_CONTEXT_SWITCHES:
 	case PERF_COUNT_SW_CPU_MIGRATIONS:
+	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
+	case PERF_COUNT_SW_EMULATION_FAULTS:
 		if (!event->parent) {
 			atomic_inc(&perf_swevent_enabled[event_id]);
 			event->destroy = sw_perf_event_destroy;
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index fdd42a824c98..f000c30877ac 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -137,6 +137,8 @@ enum sw_event_ids {
 	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
 	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
 	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
+	PERF_COUNT_SW_ALIGNMENT_FAULTS	= 7,
+	PERF_COUNT_SW_EMULATION_FAULTS	= 8,
 };
 
 Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 8cfb48cbbea0..34bd84423933 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -46,6 +46,8 @@ static struct event_symbol event_symbols[] = {
   { CSW(PAGE_FAULTS_MAJ),	"major-faults",		""		},
   { CSW(CONTEXT_SWITCHES),	"context-switches",	"cs"		},
   { CSW(CPU_MIGRATIONS),	"cpu-migrations",	"migrations"	},
+  { CSW(ALIGNMENT_FAULTS),	"alignment-faults",	""		},
+  { CSW(EMULATION_FAULTS),	"emulation-faults",	""		},
 };
 
 #define __PERF_EVENT_FIELD(config, name) \
@@ -74,6 +76,8 @@ static const char *sw_event_names[] = {
 	"CPU-migrations",
 	"minor-faults",
 	"major-faults",
+	"alignment-faults",
+	"emulation-faults",
 };
 
 #define MAX_ALIASES 8
-- 
cgit v1.2.3


From 1af60fbd759d31f565552fea315c2033947cfbe6 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Fri, 2 Oct 2009 18:56:53 -0400
Subject: block: get rid of the WRITE_ODIRECT flag

Hi,

The WRITE_ODIRECT flag is only used in one place, and that code path
happens to also call blk_run_address_space.  The introduction of this
flag, then, could result in the device being unplugged twice for every
I/O.

Further, with the batching changes in the next patch, we don't want an
O_DIRECT write to imply a queue unplug.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/direct-io.c     | 2 +-
 include/linux/fs.h | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87dc01a..c86d35f142de 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		rw = WRITE_ODIRECT;
+		rw = WRITE_SYNC_PLUG;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2620a8c63571..2f5fca4147c2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -129,7 +129,6 @@ struct inodes_stat_t {
  * WRITE_SYNC		Like WRITE_SYNC_PLUG, but also unplugs the device
  *			immediately after submission. The write equivalent
  *			of READ_SYNC.
- * WRITE_ODIRECT	Special case write for O_DIRECT only.
  * SWRITE_SYNC
  * SWRITE_SYNC_PLUG	Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
  *			See SWRITE.
@@ -151,7 +150,6 @@ struct inodes_stat_t {
 #define READ_META	(READ | (1 << BIO_RW_META))
 #define WRITE_SYNC_PLUG	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
 #define WRITE_SYNC	(WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
-#define WRITE_ODIRECT	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
 #define SWRITE_SYNC_PLUG	\
 			(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
 #define SWRITE_SYNC	(SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
-- 
cgit v1.2.3


From 44a0873d52282f24b1894c58c0f157e0f626ddc9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Oct 2009 07:03:04 +0000
Subject: net: Introduce unregister_netdevice_queue()

This patchs adds an unreg_list anchor to struct net_device, and
introduces an unregister_netdevice_queue() function, able to queue
a net_device to a list instead of immediately unregister it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  9 ++++++++-
 net/core/dev.c            | 20 +++++++++++++-------
 2 files changed, 21 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 83800091a31a..0ded0a4768a0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -683,6 +683,7 @@ struct net_device
 
 	struct list_head	dev_list;
 	struct list_head	napi_list;
+	struct list_head	unreg_list;
 
 	/* Net device features */
 	unsigned long		features;
@@ -1116,7 +1117,13 @@ extern int		dev_close(struct net_device *dev);
 extern void		dev_disable_lro(struct net_device *dev);
 extern int		dev_queue_xmit(struct sk_buff *skb);
 extern int		register_netdevice(struct net_device *dev);
-extern void		unregister_netdevice(struct net_device *dev);
+extern void		unregister_netdevice_queue(struct net_device *dev,
+						   struct list_head *head);
+static inline void unregister_netdevice(struct net_device *dev)
+{
+	unregister_netdevice_queue(dev, NULL);
+}
+
 extern void		free_netdev(struct net_device *dev);
 extern void		synchronize_net(void);
 extern int 		register_netdevice_notifier(struct notifier_block *nb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 950c13fa60d2..ff94e2b8df7f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5245,25 +5245,31 @@ void synchronize_net(void)
 EXPORT_SYMBOL(synchronize_net);
 
 /**
- *	unregister_netdevice - remove device from the kernel
+ *	unregister_netdevice_queue - remove device from the kernel
  *	@dev: device
- *
+ *	@head: list
+
  *	This function shuts down a device interface and removes it
  *	from the kernel tables.
+ *	If head not NULL, device is queued to be unregistered later.
  *
  *	Callers must hold the rtnl semaphore.  You may want
  *	unregister_netdev() instead of this.
  */
 
-void unregister_netdevice(struct net_device *dev)
+void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
 {
 	ASSERT_RTNL();
 
-	rollback_registered(dev);
-	/* Finish processing unregister after unlock */
-	net_set_todo(dev);
+	if (head) {
+		list_add_tail(&dev->unreg_list, head);
+	} else {
+		rollback_registered(dev);
+		/* Finish processing unregister after unlock */
+		net_set_todo(dev);
+	}
 }
-EXPORT_SYMBOL(unregister_netdevice);
+EXPORT_SYMBOL(unregister_netdevice_queue);
 
 /**
  *	unregister_netdev - remove device from the kernel
-- 
cgit v1.2.3


From 9b5e383c11b08784eb0087617f880077982ef769 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Oct 2009 07:04:19 +0000
Subject: net: Introduce unregister_netdevice_many()

Introduce rollback_registered_many() and unregister_netdevice_many()

rollback_registered_many() is able to perform necessary steps at device dismantle
time, factorizing two expensive synchronize_net() calls.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 97 +++++++++++++++++++++++++++++++----------------
 2 files changed, 66 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0ded0a4768a0..e7c227d7cb98 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1119,6 +1119,7 @@ extern int		dev_queue_xmit(struct sk_buff *skb);
 extern int		register_netdevice(struct net_device *dev);
 extern void		unregister_netdevice_queue(struct net_device *dev,
 						   struct list_head *head);
+extern void		unregister_netdevice_many(struct list_head *head);
 static inline void unregister_netdevice(struct net_device *dev)
 {
 	unregister_netdevice_queue(dev, NULL);
diff --git a/net/core/dev.c b/net/core/dev.c
index ff94e2b8df7f..04d3e3014020 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4637,59 +4637,76 @@ static void net_set_todo(struct net_device *dev)
 	list_add_tail(&dev->todo_list, &net_todo_list);
 }
 
-static void rollback_registered(struct net_device *dev)
+static void rollback_registered_many(struct list_head *head)
 {
+	struct net_device *dev;
+
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
 
-	/* Some devices call without registering for initialization unwind. */
-	if (dev->reg_state == NETREG_UNINITIALIZED) {
-		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
-				  "was registered\n", dev->name, dev);
+	list_for_each_entry(dev, head, unreg_list) {
+		/* Some devices call without registering
+		 * for initialization unwind.
+		 */
+		if (dev->reg_state == NETREG_UNINITIALIZED) {
+			pr_debug("unregister_netdevice: device %s/%p never "
+				 "was registered\n", dev->name, dev);
 
-		WARN_ON(1);
-		return;
-	}
+			WARN_ON(1);
+			return;
+		}
 
-	BUG_ON(dev->reg_state != NETREG_REGISTERED);
+		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 
-	/* If device is running, close it first. */
-	dev_close(dev);
+		/* If device is running, close it first. */
+		dev_close(dev);
 
-	/* And unlink it from device chain. */
-	unlist_netdevice(dev);
+		/* And unlink it from device chain. */
+		unlist_netdevice(dev);
 
-	dev->reg_state = NETREG_UNREGISTERING;
+		dev->reg_state = NETREG_UNREGISTERING;
+	}
 
 	synchronize_net();
 
-	/* Shutdown queueing discipline. */
-	dev_shutdown(dev);
+	list_for_each_entry(dev, head, unreg_list) {
+		/* Shutdown queueing discipline. */
+		dev_shutdown(dev);
 
 
-	/* Notify protocols, that we are about to destroy
-	   this device. They should clean all the things.
-	*/
-	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+		/* Notify protocols, that we are about to destroy
+		   this device. They should clean all the things.
+		*/
+		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
-	/*
-	 *	Flush the unicast and multicast chains
-	 */
-	dev_unicast_flush(dev);
-	dev_addr_discard(dev);
+		/*
+		 *	Flush the unicast and multicast chains
+		 */
+		dev_unicast_flush(dev);
+		dev_addr_discard(dev);
 
-	if (dev->netdev_ops->ndo_uninit)
-		dev->netdev_ops->ndo_uninit(dev);
+		if (dev->netdev_ops->ndo_uninit)
+			dev->netdev_ops->ndo_uninit(dev);
 
-	/* Notifier chain MUST detach us from master device. */
-	WARN_ON(dev->master);
+		/* Notifier chain MUST detach us from master device. */
+		WARN_ON(dev->master);
 
-	/* Remove entries from kobject tree */
-	netdev_unregister_kobject(dev);
+		/* Remove entries from kobject tree */
+		netdev_unregister_kobject(dev);
+	}
 
 	synchronize_net();
 
-	dev_put(dev);
+	list_for_each_entry(dev, head, unreg_list)
+		dev_put(dev);
+}
+
+static void rollback_registered(struct net_device *dev)
+{
+	LIST_HEAD(single);
+
+	list_add(&dev->unreg_list, &single);
+	rollback_registered_many(&single);
 }
 
 static void __netdev_init_queue_locks_one(struct net_device *dev,
@@ -5271,6 +5288,22 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
 }
 EXPORT_SYMBOL(unregister_netdevice_queue);
 
+/**
+ *	unregister_netdevice_many - unregister many devices
+ *	@head: list of devices
+ *
+ */
+void unregister_netdevice_many(struct list_head *head)
+{
+	struct net_device *dev;
+
+	if (!list_empty(head)) {
+		rollback_registered_many(head);
+		list_for_each_entry(dev, head, unreg_list)
+			net_set_todo(dev);
+	}
+}
+
 /**
  *	unregister_netdev - remove device from the kernel
  *	@dev: device
-- 
cgit v1.2.3


From 63c8099d90096db56ee1c66c31f05d4fcfbc1c69 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Oct 2009 07:06:49 +0000
Subject: vlan: Optimize multiple unregistration

Use unregister_netdevice_many() to speedup master device unregister.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |  1 +
 net/8021q/vlan.c        | 49 +++++++++++++++++++++++++++++++++----------------
 net/core/dev.c          |  1 +
 3 files changed, 35 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 8898cbebcf34..71a4870c09a9 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -85,6 +85,7 @@ struct vlan_group {
 					    * the vlan is attached to.
 					    */
 	unsigned int		nr_vlans;
+	int			killall;
 	struct hlist_node	hlist;	/* linked list */
 	struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS];
 	struct rcu_head		rcu;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 6b5c9dddaa72..511afe72af31 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -159,11 +159,12 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 	if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
 		ops->ndo_vlan_rx_kill_vid(real_dev, vlan_id);
 
-	vlan_group_set_device(grp, vlan_id, NULL);
 	grp->nr_vlans--;
 
-	synchronize_net();
-
+	if (!grp->killall) {
+		vlan_group_set_device(grp, vlan_id, NULL);
+		synchronize_net();
+	}
 	unregister_netdevice_queue(dev, head);
 
 	/* If the group is now empty, kill off the group. */
@@ -183,6 +184,34 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 	dev_put(real_dev);
 }
 
+void unregister_vlan_dev_alls(struct vlan_group *grp)
+{
+	LIST_HEAD(list);
+	int i;
+	struct net_device *vlandev;
+	struct vlan_group save;
+
+	memcpy(&save, grp, sizeof(save));
+	memset(&grp->vlan_devices_arrays, 0, sizeof(grp->vlan_devices_arrays));
+	grp->killall = 1;
+
+	synchronize_net();
+
+	/* Delete all VLANs for this dev. */
+	for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+		vlandev = vlan_group_get_device(&save, i);
+		if (!vlandev)
+			continue;
+
+		unregister_vlan_dev(vlandev, &list);
+		if (grp->nr_vlans == 0)
+			break;
+	}
+	unregister_netdevice_many(&list);
+	for (i = 0; i < VLAN_GROUP_ARRAY_SPLIT_PARTS; i++)
+		kfree(save.vlan_devices_arrays[i]);
+}
+
 static void vlan_transfer_operstate(const struct net_device *dev,
 				    struct net_device *vlandev)
 {
@@ -524,19 +553,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		break;
 
 	case NETDEV_UNREGISTER:
-		/* Delete all VLANs for this dev. */
-		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
-			vlandev = vlan_group_get_device(grp, i);
-			if (!vlandev)
-				continue;
-
-			/* unregistration of last vlan destroys group, abort
-			 * afterwards */
-			if (grp->nr_vlans == 1)
-				i = VLAN_GROUP_ARRAY_LEN;
-
-			unregister_vlan_dev(vlandev, NULL);
-		}
+		unregister_vlan_dev_alls(grp);
 		break;
 	}
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 4513dfd5718e..09551cc143a9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5303,6 +5303,7 @@ void unregister_netdevice_many(struct list_head *head)
 			net_set_todo(dev);
 	}
 }
+EXPORT_SYMBOL(unregister_netdevice_many);
 
 /**
  *	unregister_netdev - remove device from the kernel
-- 
cgit v1.2.3


From ff76ec18cabb12a6c8f3c65bd1d23f1a770fe908 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 28 Oct 2009 12:26:39 -0700
Subject: tpm: fix header for modular build

Fix build for TCG_TPM=m.  Header file doesn't handle this
and incorrectly builds stubs.

drivers/char/tpm/tpm.c:720: error: redefinition of 'tpm_pcr_read'
include/linux/tpm.h:35: error:previous definition of 'tpm_pcr_read' was here
drivers/char/tpm/tpm.c:752: error: redefinition of 'tpm_pcr_extend'
include/linux/tpm.h:38: error:previous definition of 'tpm_pcr_extend' was here

Repairs linux-next's

commit d6ba452128178091dab7a04d54f7e66fdc32fb39
Author: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date:   Mon Oct 26 09:26:18 2009 -0400

    tpm add default function definitions

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Rajiv Andrade <srajiv@linux.vnet.ibm.com>
Cc: Mimi Zohar <zohar@us.ibm.com>
Cc: James Morris <jmorris@namei.org>
Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/tpm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 8eaa8f83effb..ac5d1c1285d9 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -27,7 +27,7 @@
  */
 #define	TPM_ANY_NUM 0xFFFF
 
-#if defined(CONFIG_TCG_TPM)
+#if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE)
 
 extern int tpm_pcr_read(u32 chip_num, int pcr_idx, u8 *res_buf);
 extern int tpm_pcr_extend(u32 chip_num, int pcr_idx, const u8 *hash);
-- 
cgit v1.2.3


From df5c79452f26f2a3d0883a213102515cfeb7aae9 Mon Sep 17 00:00:00 2001
From: Yi Zou <yi.zou@intel.com>
Date: Wed, 28 Oct 2009 18:24:35 +0000
Subject: net: Add ndo_fcoe_get_wwn to net_device_ops

Add ndo_fcoe_get_wwn so Fiber Channel over Ethernet (FCoE) can make use of
the provided World Wide Port Name (WWPN) and World Wide Node Name (WWNN)
from the underlying network interface driver.

Signed-off-by: Yi Zou <yi.zou@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e7c227d7cb98..656110a46e96 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -635,6 +635,10 @@ struct net_device_ops {
 						      unsigned int sgc);
 	int			(*ndo_fcoe_ddp_done)(struct net_device *dev,
 						     u16 xid);
+#define NETDEV_FCOE_WWNN 0
+#define NETDEV_FCOE_WWPN 1
+	int			(*ndo_fcoe_get_wwn)(struct net_device *dev,
+						    u64 *wwn, int type);
 #endif
 };
 
-- 
cgit v1.2.3


From 1aba721eba1d84a2defce45b950272cee1e6c72a Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@codefidence.com>
Date: Wed, 28 Oct 2009 04:15:24 +0000
Subject: Add the no SACK route option feature

Implement querying and acting upon the no sack bit in the features
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 2 +-
 net/ipv4/tcp_input.c      | 3 ++-
 net/ipv4/tcp_output.c     | 4 +++-
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index adf2068d12b5..9c802a6b04d3 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -377,7 +377,7 @@ enum
 #define RTAX_MAX (__RTAX_MAX - 1)
 
 #define RTAX_FEATURE_ECN	0x00000001
-#define RTAX_FEATURE_SACK	0x00000002
+#define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_TIMESTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c7625005486d..5fb25f977451 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3763,7 +3763,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_SACK_PERM:
 				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
-				    !estab && sysctl_tcp_sack) {
+				    !estab && sysctl_tcp_sack &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_SACK)) {
 					opt_rx->sack_ok = 1;
 					tcp_sack_reset(opt_rx);
 				}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2e2eb74ac4cc..b35802af3c4c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -464,6 +464,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 				struct tcp_md5sig_key **md5) {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned size = 0;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -498,7 +499,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->options |= OPTION_WSCALE;
 		size += TCPOLEN_WSCALE_ALIGNED;
 	}
-	if (likely(sysctl_tcp_sack)) {
+	if (likely(sysctl_tcp_sack &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_SACK))) {
 		opts->options |= OPTION_SACK_ADVERTISE;
 		if (unlikely(!(OPTION_TS & opts->options)))
 			size += TCPOLEN_SACKPERM_ALIGNED;
-- 
cgit v1.2.3


From cda42ebd67ee5fdf09d7057b5a4584d36fe8a335 Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@codefidence.com>
Date: Wed, 28 Oct 2009 04:15:25 +0000
Subject: Allow disabling TCP timestamp options per route

Implement querying and acting upon the no timestamp bit in the feature
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 2 +-
 net/ipv4/tcp_input.c      | 3 ++-
 net/ipv4/tcp_output.c     | 8 ++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9c802a6b04d3..2ab8c758b46c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -378,7 +378,7 @@ enum
 
 #define RTAX_FEATURE_ECN	0x00000001
 #define RTAX_FEATURE_NO_SACK	0x00000002
-#define RTAX_FEATURE_TIMESTAMP	0x00000004
+#define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
 struct rta_session
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5fb25f977451..6097491aa9fc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3755,7 +3755,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 			case TCPOPT_TIMESTAMP:
 				if ((opsize == TCPOLEN_TIMESTAMP) &&
 				    ((estab && opt_rx->tstamp_ok) ||
-				     (!estab && sysctl_tcp_timestamps))) {
+				     (!estab && sysctl_tcp_timestamps &&
+				      !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP)))) {
 					opt_rx->saw_tstamp = 1;
 					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
 					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b35802af3c4c..8819eba8ebb8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -488,7 +488,9 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	opts->mss = tcp_advertise_mss(sk);
 	size += TCPOLEN_MSS_ALIGNED;
 
-	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+	if (likely(sysctl_tcp_timestamps &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) &&
+		   *md5 == NULL)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = TCP_SKB_CB(skb)->when;
 		opts->tsecr = tp->rx_opt.ts_recent;
@@ -2317,7 +2319,9 @@ static void tcp_connect_init(struct sock *sk)
 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
 	 */
 	tp->tcp_header_len = sizeof(struct tcphdr) +
-		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+		(sysctl_tcp_timestamps &&
+		(!dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) ?
+		  TCPOLEN_TSTAMP_ALIGNED : 0));
 
 #ifdef CONFIG_TCP_MD5SIG
 	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
-- 
cgit v1.2.3


From 345cda2fd695534be5a4494f1b59da9daed33663 Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@codefidence.com>
Date: Wed, 28 Oct 2009 04:15:26 +0000
Subject: Allow to turn off TCP window scale opt per route

Add and use no window scale bit in the features field.

Note that this is not the same as setting a window scale of 0
as would happen with window limit on route.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 1 +
 net/ipv4/tcp_input.c      | 3 ++-
 net/ipv4/tcp_output.c     | 6 ++++--
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 2ab8c758b46c..6784b342cbbf 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -380,6 +380,7 @@ enum
 #define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
+#define RTAX_FEATURE_NO_WSCALE	0x00000010
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6097491aa9fc..393c56921dcb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3739,7 +3739,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_WINDOW:
 				if (opsize == TCPOLEN_WINDOW && th->syn &&
-				    !estab && sysctl_tcp_window_scaling) {
+				    !estab && sysctl_tcp_window_scaling &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)) {
 					__u8 snd_wscale = *(__u8 *)ptr;
 					opt_rx->wscale_ok = 1;
 					if (snd_wscale > 14) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8819eba8ebb8..616c686ca253 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -496,7 +496,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
-	if (likely(sysctl_tcp_window_scaling)) {
+	if (likely(sysctl_tcp_window_scaling &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) {
 		opts->ws = tp->rx_opt.rcv_wscale;
 		opts->options |= OPTION_WSCALE;
 		size += TCPOLEN_WSCALE_ALIGNED;
@@ -2347,7 +2348,8 @@ static void tcp_connect_init(struct sock *sk)
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 				  &tp->rcv_wnd,
 				  &tp->window_clamp,
-				  sysctl_tcp_window_scaling,
+				  (sysctl_tcp_window_scaling &&
+				   !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)),
 				  &rcv_wscale);
 
 	tp->rx_opt.rcv_wscale = rcv_wscale;
-- 
cgit v1.2.3


From dc343475ed062e13fc260acccaab91d7d80fd5b2 Mon Sep 17 00:00:00 2001
From: Gilad Ben-Yossef <gilad@codefidence.com>
Date: Wed, 28 Oct 2009 04:15:27 +0000
Subject: Allow disabling of DSACK TCP option per route

Add and use no DSCAK bit in the features field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 1 +
 net/ipv4/tcp_input.c      | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6784b342cbbf..e78b60cd65a4 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -381,6 +381,7 @@ enum
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 #define RTAX_FEATURE_NO_WSCALE	0x00000010
+#define RTAX_FEATURE_NO_DSACK	0x00000020
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 393c56921dcb..ba0eab65fe80 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4080,8 +4080,10 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
 static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
-	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+	if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+	    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 		int mib_idx;
 
 		if (before(seq, tp->rcv_nxt))
@@ -4110,13 +4112,15 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
 static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
 		tcp_enter_quickack_mode(sk);
 
-		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+		if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+		    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
 			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
-- 
cgit v1.2.3


From fb699dfd426a189fe33b91586c15176a75c8aed0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 19 Oct 2009 19:18:49 +0000
Subject: net: Introduce dev_get_by_index_rcu()

Some workloads hit dev_base_lock rwlock pretty hard.
We can use RCU lookups to avoid touching this rwlock.

netdevices are already freed after a RCU grace period, so this patch
adds no penalty at device dismantle time.

dev_ifname() converted to dev_get_by_index_rcu()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 48 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 39 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 656110a46e96..ffc3106cc037 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1139,6 +1139,7 @@ extern void		netdev_resync_ops(struct net_device *dev);
 extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
 extern struct net_device	*dev_get_by_index(struct net *net, int ifindex);
 extern struct net_device	*__dev_get_by_index(struct net *net, int ifindex);
+extern struct net_device	*dev_get_by_index_rcu(struct net *net, int ifindex);
 extern int		dev_restart(struct net_device *dev);
 #ifdef CONFIG_NETPOLL_TRAP
 extern int		netpoll_trap(void);
diff --git a/net/core/dev.c b/net/core/dev.c
index 09551cc143a9..68a1bb68b5a8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -214,12 +214,15 @@ static int list_netdevice(struct net_device *dev)
 	write_lock_bh(&dev_base_lock);
 	list_add_tail(&dev->dev_list, &net->dev_base_head);
 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
-	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
+	hlist_add_head_rcu(&dev->index_hlist,
+			   dev_index_hash(net, dev->ifindex));
 	write_unlock_bh(&dev_base_lock);
 	return 0;
 }
 
-/* Device list removal */
+/* Device list removal
+ * caller must respect a RCU grace period before freeing/reusing dev
+ */
 static void unlist_netdevice(struct net_device *dev)
 {
 	ASSERT_RTNL();
@@ -228,7 +231,7 @@ static void unlist_netdevice(struct net_device *dev)
 	write_lock_bh(&dev_base_lock);
 	list_del(&dev->dev_list);
 	hlist_del(&dev->name_hlist);
-	hlist_del(&dev->index_hlist);
+	hlist_del_rcu(&dev->index_hlist);
 	write_unlock_bh(&dev_base_lock);
 }
 
@@ -646,6 +649,31 @@ struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 }
 EXPORT_SYMBOL(__dev_get_by_index);
 
+/**
+ *	dev_get_by_index_rcu - find a device by its ifindex
+ *	@net: the applicable net namespace
+ *	@ifindex: index of device
+ *
+ *	Search for an interface by index. Returns %NULL if the device
+ *	is not found or a pointer to the device. The device has not
+ *	had its reference counter increased so the caller must be careful
+ *	about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
+{
+	struct hlist_node *p;
+	struct net_device *dev;
+	struct hlist_head *head = dev_index_hash(net, ifindex);
+
+	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
+		if (dev->ifindex == ifindex)
+			return dev;
+
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_index_rcu);
+
 
 /**
  *	dev_get_by_index - find a device by its ifindex
@@ -662,11 +690,11 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_index(net, ifindex);
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
 	if (dev)
 		dev_hold(dev);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return dev;
 }
 EXPORT_SYMBOL(dev_get_by_index);
@@ -2939,15 +2967,15 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)
 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
 		return -EFAULT;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
 	if (!dev) {
-		read_unlock(&dev_base_lock);
+		rcu_read_unlock();
 		return -ENODEV;
 	}
 
 	strcpy(ifr.ifr_name, dev->name);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 
 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
 		return -EFAULT;
-- 
cgit v1.2.3


From 38bfd8f5bec496e8e0db8849e01c99a33479418a Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 29 Oct 2009 02:59:18 -0700
Subject: net,socket: introduce DECLARE_SOCKADDR helper to catch overflow at
 build time

proto_ops->getname implies copying protocol specific data
into storage unit (particulary to __kernel_sockaddr_storage).
So when we implement new protocol support we should keep such
a detail in mind (which is easy to forget about).

Lets introduce DECLARE_SOCKADDR helper which check if
storage unit is not overfowed at build time.

Eventually inet_getname is switched to use DECLARE_SOCKADDR
(to show example of usage).

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h    | 3 +++
 include/linux/socket.h | 3 +++
 net/ipv4/af_inet.c     | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index b42bb60fe92f..4da9d571b053 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -199,6 +199,9 @@ struct proto_ops {
 				       struct pipe_inode_info *pipe, size_t len, unsigned int flags);
 };
 
+#define DECLARE_SOCKADDR(type, dst, src)	\
+	type dst = ({ __sockaddr_check_size(sizeof(*dst)); (type) src; })
+
 struct net_proto_family {
 	int		family;
 	int		(*create)(struct net *net, struct socket *sock, int protocol);
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 59966f12990c..7b3aae2052a6 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -24,6 +24,9 @@ struct __kernel_sockaddr_storage {
 #include <linux/types.h>		/* pid_t			*/
 #include <linux/compiler.h>		/* __user			*/
 
+#define __sockaddr_check_size(size)	\
+	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
+
 #ifdef __KERNEL__
 # ifdef CONFIG_PROC_FS
 struct seq_file;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 04a14b1600ac..538e84d0bcba 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -685,7 +685,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 {
 	struct sock *sk		= sock->sk;
 	struct inet_sock *inet	= inet_sk(sk);
-	struct sockaddr_in *sin	= (struct sockaddr_in *)uaddr;
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
 
 	sin->sin_family = AF_INET;
 	if (peer) {
-- 
cgit v1.2.3


From b9d128f1088ea5245109dfc9bbceb128b6371a77 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 29 Oct 2009 13:59:26 +0100
Subject: block: move bdi/address_space unplug functions to backing-dev.h

There's nothing block related about them, the backing device
is used by things like NFS etc as well. This gets rid of the
need to protect such calls by CONFIG_BLOCK.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/aio.c                    |  1 +
 include/linux/backing-dev.h | 13 +++++++++++++
 include/linux/blkdev.h      | 13 -------------
 3 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index cf0bef428f88..c30dfc006108 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
 #include <linux/aio_abi.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 #include <linux/uio.h>
 
 #define DEBUG 0
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b449e738533a..fcbc26af00e4 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -331,4 +331,17 @@ static inline int bdi_sched_wait(void *word)
 	return 0;
 }
 
+static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
+				       struct page *page)
+{
+	if (bdi && bdi->unplug_io_fn)
+		bdi->unplug_io_fn(bdi, page);
+}
+
+static inline void blk_run_address_space(struct address_space *mapping)
+{
+	if (mapping)
+		blk_run_backing_dev(mapping->backing_dev_info, NULL);
+}
+
 #endif		/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 221cecd86bd3..39c601f783a0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -823,19 +823,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 	return bdev->bd_disk->queue;
 }
 
-static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
-				       struct page *page)
-{
-	if (bdi && bdi->unplug_io_fn)
-		bdi->unplug_io_fn(bdi, page);
-}
-
-static inline void blk_run_address_space(struct address_space *mapping)
-{
-	if (mapping)
-		blk_run_backing_dev(mapping->backing_dev_info, NULL);
-}
-
 /*
  * blk_rq_pos()			: the current sector
  * blk_rq_bytes()		: bytes left in the entire request
-- 
cgit v1.2.3


From 5975c725dfd6f7d36f493ab1453fbdbd35c1f0e3 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Thu, 29 Oct 2009 11:40:17 -0500
Subject: define convenient securebits masks for prctl users (v2)

Hi James, would you mind taking the following into
security-testing?

The securebits are used by passing them to prctl with the
PR_{S,G}ET_SECUREBITS commands.  But the defines must be
shifted to be used in prctl, which begs to be confused and
misused by userspace.  So define some more convenient
values for userspace to specify.  This way userspace does

	prctl(PR_SET_SECUREBITS, SECBIT_NOROOT);

instead of

	prctl(PR_SET_SECUREBITS, 1 << SECURE_NOROOT);

(Thanks to Michael for the idea)

This patch also adds include/linux/securebits to the installed headers.
Then perhaps it can be included by glibc's sys/prctl.h.

Changelog:
	Oct 29: Stephen Rothwell points out that issecure can
		be under __KERNEL__.
	Oct 14: (Suggestions by Michael Kerrisk):
		1. spell out SETUID in SECBIT_NO_SETUID*
		2. SECBIT_X_LOCKED does not imply SECBIT_X
		3. add definitions for keepcaps
        Oct 14: As suggested by Michael Kerrisk, don't
		use SB_* as that convention is already in
		use.  Use SECBIT_ prefix instead.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/Kbuild       |  1 +
 include/linux/securebits.h | 24 ++++++++++++++++++------
 2 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index cff4a101f266..ffcdb9b509db 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -329,6 +329,7 @@ unifdef-y += scc.h
 unifdef-y += sched.h
 unifdef-y += screen_info.h
 unifdef-y += sdla.h
+unifdef-y += securebits.h
 unifdef-y += selinux_netlink.h
 unifdef-y += sem.h
 unifdef-y += serial_core.h
diff --git a/include/linux/securebits.h b/include/linux/securebits.h
index d2c5ed845bcc..33406174cbe8 100644
--- a/include/linux/securebits.h
+++ b/include/linux/securebits.h
@@ -1,6 +1,15 @@
 #ifndef _LINUX_SECUREBITS_H
 #define _LINUX_SECUREBITS_H 1
 
+/* Each securesetting is implemented using two bits. One bit specifies
+   whether the setting is on or off. The other bit specify whether the
+   setting is locked or not. A setting which is locked cannot be
+   changed from user-level. */
+#define issecure_mask(X)	(1 << (X))
+#ifdef __KERNEL__
+#define issecure(X)		(issecure_mask(X) & current_cred_xxx(securebits))
+#endif
+
 #define SECUREBITS_DEFAULT 0x00000000
 
 /* When set UID 0 has no special privileges. When unset, we support
@@ -12,6 +21,9 @@
 #define SECURE_NOROOT			0
 #define SECURE_NOROOT_LOCKED		1  /* make bit-0 immutable */
 
+#define SECBIT_NOROOT		(issecure_mask(SECURE_NOROOT))
+#define SECBIT_NOROOT_LOCKED	(issecure_mask(SECURE_NOROOT_LOCKED))
+
 /* When set, setuid to/from uid 0 does not trigger capability-"fixup".
    When unset, to provide compatiblility with old programs relying on
    set*uid to gain/lose privilege, transitions to/from uid 0 cause
@@ -19,6 +31,10 @@
 #define SECURE_NO_SETUID_FIXUP		2
 #define SECURE_NO_SETUID_FIXUP_LOCKED	3  /* make bit-2 immutable */
 
+#define SECBIT_NO_SETUID_FIXUP	(issecure_mask(SECURE_NO_SETUID_FIXUP))
+#define SECBIT_NO_SETUID_FIXUP_LOCKED \
+			(issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED))
+
 /* When set, a process can retain its capabilities even after
    transitioning to a non-root user (the set-uid fixup suppressed by
    bit 2). Bit-4 is cleared when a process calls exec(); setting both
@@ -27,12 +43,8 @@
 #define SECURE_KEEP_CAPS		4
 #define SECURE_KEEP_CAPS_LOCKED		5  /* make bit-4 immutable */
 
-/* Each securesetting is implemented using two bits. One bit specifies
-   whether the setting is on or off. The other bit specify whether the
-   setting is locked or not. A setting which is locked cannot be
-   changed from user-level. */
-#define issecure_mask(X)	(1 << (X))
-#define issecure(X)		(issecure_mask(X) & current_cred_xxx(securebits))
+#define SECBIT_KEEP_CAPS	(issecure_mask(SECURE_KEEP_CAPS))
+#define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED))
 
 #define SECURE_ALL_BITS		(issecure_mask(SECURE_NOROOT) | \
 				 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
-- 
cgit v1.2.3


From 5b252f0c2f98df21fadf0f6cf189b87a0b938228 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 29 Oct 2009 07:17:09 +0000
Subject: gro: Name the GRO result enumeration type

This clarifies which return and parameter types are GRO result codes
and not RX result codes.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 10 ++++++----
 net/8021q/vlan_core.c     |  5 +++--
 net/core/dev.c            | 19 ++++++++++++++-----
 3 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ffc3106cc037..6e777efe149e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -348,13 +348,14 @@ enum
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 };
 
-enum {
+enum gro_result {
 	GRO_MERGED,
 	GRO_MERGED_FREE,
 	GRO_HELD,
 	GRO_NORMAL,
 	GRO_DROP,
 };
+typedef enum gro_result gro_result_t;
 
 extern void __napi_schedule(struct napi_struct *n);
 
@@ -1480,16 +1481,17 @@ extern int		netif_rx_ni(struct sk_buff *skb);
 #define HAVE_NETIF_RECEIVE_SKB 1
 extern int		netif_receive_skb(struct sk_buff *skb);
 extern void		napi_gro_flush(struct napi_struct *napi);
-extern int		dev_gro_receive(struct napi_struct *napi,
+extern gro_result_t	dev_gro_receive(struct napi_struct *napi,
 					struct sk_buff *skb);
-extern int		napi_skb_finish(int ret, struct sk_buff *skb);
+extern int		napi_skb_finish(gro_result_t ret, struct sk_buff *skb);
 extern int		napi_gro_receive(struct napi_struct *napi,
 					 struct sk_buff *skb);
 extern void		napi_reuse_skb(struct napi_struct *napi,
 				       struct sk_buff *skb);
 extern struct sk_buff *	napi_get_frags(struct napi_struct *napi);
 extern int		napi_frags_finish(struct napi_struct *napi,
-					  struct sk_buff *skb, int ret);
+					  struct sk_buff *skb,
+					  gro_result_t ret);
 extern struct sk_buff *	napi_frags_skb(struct napi_struct *napi);
 extern int		napi_gro_frags(struct napi_struct *napi);
 
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 7f7de1a04de6..47a80d65c3b7 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -74,8 +74,9 @@ u16 vlan_dev_vlan_id(const struct net_device *dev)
 }
 EXPORT_SYMBOL(vlan_dev_vlan_id);
 
-static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
-			   unsigned int vlan_tci, struct sk_buff *skb)
+static gro_result_t
+vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
+		unsigned int vlan_tci, struct sk_buff *skb)
 {
 	struct sk_buff *p;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 68a1bb68b5a8..1dc13744684c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2476,7 +2476,7 @@ void napi_gro_flush(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_gro_flush);
 
-int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff **pp = NULL;
 	struct packet_type *ptype;
@@ -2484,7 +2484,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
 	int same_flow;
 	int mac_len;
-	int ret;
+	enum gro_result ret;
 
 	if (!(skb->dev->features & NETIF_F_GRO))
 		goto normal;
@@ -2568,7 +2568,8 @@ normal:
 }
 EXPORT_SYMBOL(dev_gro_receive);
 
-static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+static gro_result_t
+__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff *p;
 
@@ -2585,7 +2586,7 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	return dev_gro_receive(napi, skb);
 }
 
-int napi_skb_finish(int ret, struct sk_buff *skb)
+int napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
 	int err = NET_RX_SUCCESS;
 
@@ -2600,6 +2601,10 @@ int napi_skb_finish(int ret, struct sk_buff *skb)
 	case GRO_MERGED_FREE:
 		kfree_skb(skb);
 		break;
+
+	case GRO_HELD:
+	case GRO_MERGED:
+		break;
 	}
 
 	return err;
@@ -2652,7 +2657,8 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_get_frags);
 
-int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
+int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
+		      gro_result_t ret)
 {
 	int err = NET_RX_SUCCESS;
 
@@ -2674,6 +2680,9 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
 	case GRO_MERGED_FREE:
 		napi_reuse_skb(napi, skb);
 		break;
+
+	case GRO_MERGED:
+		break;
 	}
 
 	return err;
-- 
cgit v1.2.3


From c7c4b3b6e976b95facbb723951bdcd554a3530a4 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Thu, 29 Oct 2009 21:36:53 -0700
Subject: gro: Change all receive functions to return GRO result codes

This will allow drivers to adjust their receive path dynamically
based on whether GRO is being applied successfully.

Currently all in-tree callers ignore the return values of these
functions and do not need to be changed.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h   | 25 ++++++++++++++-----------
 include/linux/netdevice.h |  8 ++++----
 net/8021q/vlan_core.c     | 16 +++++++++-------
 net/core/dev.c            | 38 +++++++++++++++-----------------------
 4 files changed, 42 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 71a4870c09a9..153f6b9e722c 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -120,10 +120,12 @@ extern u16 vlan_dev_vlan_id(const struct net_device *dev);
 extern int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
 			     u16 vlan_tci, int polling);
 extern int vlan_hwaccel_do_receive(struct sk_buff *skb);
-extern int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
-			    unsigned int vlan_tci, struct sk_buff *skb);
-extern int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
-			  unsigned int vlan_tci);
+extern gro_result_t
+vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
+		 unsigned int vlan_tci, struct sk_buff *skb);
+extern gro_result_t
+vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
+	       unsigned int vlan_tci);
 
 #else
 static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
@@ -150,17 +152,18 @@ static inline int vlan_hwaccel_do_receive(struct sk_buff *skb)
 	return 0;
 }
 
-static inline int vlan_gro_receive(struct napi_struct *napi,
-				   struct vlan_group *grp,
-				   unsigned int vlan_tci, struct sk_buff *skb)
+static inline gro_result_t
+vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
+		 unsigned int vlan_tci, struct sk_buff *skb)
 {
-	return NET_RX_DROP;
+	return GRO_DROP;
 }
 
-static inline int vlan_gro_frags(struct napi_struct *napi,
-				 struct vlan_group *grp, unsigned int vlan_tci)
+static inline gro_result_t
+vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
+	       unsigned int vlan_tci)
 {
-	return NET_RX_DROP;
+	return GRO_DROP;
 }
 #endif
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6e777efe149e..193b637889f9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1483,17 +1483,17 @@ extern int		netif_receive_skb(struct sk_buff *skb);
 extern void		napi_gro_flush(struct napi_struct *napi);
 extern gro_result_t	dev_gro_receive(struct napi_struct *napi,
 					struct sk_buff *skb);
-extern int		napi_skb_finish(gro_result_t ret, struct sk_buff *skb);
-extern int		napi_gro_receive(struct napi_struct *napi,
+extern gro_result_t	napi_skb_finish(gro_result_t ret, struct sk_buff *skb);
+extern gro_result_t	napi_gro_receive(struct napi_struct *napi,
 					 struct sk_buff *skb);
 extern void		napi_reuse_skb(struct napi_struct *napi,
 				       struct sk_buff *skb);
 extern struct sk_buff *	napi_get_frags(struct napi_struct *napi);
-extern int		napi_frags_finish(struct napi_struct *napi,
+extern gro_result_t	napi_frags_finish(struct napi_struct *napi,
 					  struct sk_buff *skb,
 					  gro_result_t ret);
 extern struct sk_buff *	napi_frags_skb(struct napi_struct *napi);
-extern int		napi_gro_frags(struct napi_struct *napi);
+extern gro_result_t	napi_gro_frags(struct napi_struct *napi);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 47a80d65c3b7..8d5ca2ac4f8d 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -102,11 +102,12 @@ drop:
 	return GRO_DROP;
 }
 
-int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
-		     unsigned int vlan_tci, struct sk_buff *skb)
+gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
+			      unsigned int vlan_tci, struct sk_buff *skb)
 {
 	if (netpoll_rx_on(skb))
-		return vlan_hwaccel_receive_skb(skb, grp, vlan_tci);
+		return vlan_hwaccel_receive_skb(skb, grp, vlan_tci)
+			? GRO_DROP : GRO_NORMAL;
 
 	skb_gro_reset_offset(skb);
 
@@ -114,17 +115,18 @@ int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
 }
 EXPORT_SYMBOL(vlan_gro_receive);
 
-int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
-		   unsigned int vlan_tci)
+gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
+			    unsigned int vlan_tci)
 {
 	struct sk_buff *skb = napi_frags_skb(napi);
 
 	if (!skb)
-		return NET_RX_DROP;
+		return GRO_DROP;
 
 	if (netpoll_rx_on(skb)) {
 		skb->protocol = eth_type_trans(skb, skb->dev);
-		return vlan_hwaccel_receive_skb(skb, grp, vlan_tci);
+		return vlan_hwaccel_receive_skb(skb, grp, vlan_tci)
+			? GRO_DROP : GRO_NORMAL;
 	}
 
 	return napi_frags_finish(napi, skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 1dc13744684c..631cc40da197 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2586,18 +2586,15 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	return dev_gro_receive(napi, skb);
 }
 
-int napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
-	int err = NET_RX_SUCCESS;
-
 	switch (ret) {
 	case GRO_NORMAL:
-		return netif_receive_skb(skb);
+		if (netif_receive_skb(skb))
+			ret = GRO_DROP;
+		break;
 
 	case GRO_DROP:
-		err = NET_RX_DROP;
-		/* fall through */
-
 	case GRO_MERGED_FREE:
 		kfree_skb(skb);
 		break;
@@ -2607,7 +2604,7 @@ int napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 		break;
 	}
 
-	return err;
+	return ret;
 }
 EXPORT_SYMBOL(napi_skb_finish);
 
@@ -2627,7 +2624,7 @@ void skb_gro_reset_offset(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(skb_gro_reset_offset);
 
-int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	skb_gro_reset_offset(skb);
 
@@ -2657,26 +2654,21 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_get_frags);
 
-int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
-		      gro_result_t ret)
+gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
+			       gro_result_t ret)
 {
-	int err = NET_RX_SUCCESS;
-
 	switch (ret) {
 	case GRO_NORMAL:
 	case GRO_HELD:
 		skb->protocol = eth_type_trans(skb, napi->dev);
 
-		if (ret == GRO_NORMAL)
-			return netif_receive_skb(skb);
-
-		skb_gro_pull(skb, -ETH_HLEN);
+		if (ret == GRO_HELD)
+			skb_gro_pull(skb, -ETH_HLEN);
+		else if (netif_receive_skb(skb))
+			ret = GRO_DROP;
 		break;
 
 	case GRO_DROP:
-		err = NET_RX_DROP;
-		/* fall through */
-
 	case GRO_MERGED_FREE:
 		napi_reuse_skb(napi, skb);
 		break;
@@ -2685,7 +2677,7 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
 		break;
 	}
 
-	return err;
+	return ret;
 }
 EXPORT_SYMBOL(napi_frags_finish);
 
@@ -2726,12 +2718,12 @@ out:
 }
 EXPORT_SYMBOL(napi_frags_skb);
 
-int napi_gro_frags(struct napi_struct *napi)
+gro_result_t napi_gro_frags(struct napi_struct *napi)
 {
 	struct sk_buff *skb = napi_frags_skb(napi);
 
 	if (!skb)
-		return NET_RX_DROP;
+		return GRO_DROP;
 
 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
 }
-- 
cgit v1.2.3


From 0c509a6c9393b27a8c5a01acd4a72616206cfc24 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@aristanetworks.com>
Date: Thu, 29 Oct 2009 14:18:21 +0000
Subject: net: Allow devices to specify a device specific sysfs group.

This isn't beautifully abstracted, but it is simple,
simplifies uses and so far is only needed for the bonding driver.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++--
 net/core/net-sysfs.c      | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 193b637889f9..e5ece8dceaad 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -900,8 +900,8 @@ struct net_device
 
 	/* class/net/name entry */
 	struct device		dev;
-	/* space for optional statistics and wireless sysfs groups */
-	const struct attribute_group *sysfs_groups[3];
+	/* space for optional device, statistics, and wireless sysfs groups */
+	const struct attribute_group *sysfs_groups[4];
 
 	/* rtnetlink link ops */
 	const struct rtnl_link_ops *rtnl_link_ops;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 89de182353b0..157645c0da73 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -544,8 +544,11 @@ int netdev_register_kobject(struct net_device *net)
 	dev_set_name(dev, "%s", net->name);
 
 #ifdef CONFIG_SYSFS
-	*groups++ = &netstat_group;
+	/* Allow for a device specific group */
+	if (*groups)
+		groups++;
 
+	*groups++ = &netstat_group;
 #ifdef CONFIG_WIRELESS_EXT_SYSFS
 	if (net->ieee80211_ptr)
 		*groups++ = &wireless_group;
-- 
cgit v1.2.3


From 22403def134e2c1017cb04ae9129a38e841b2d8c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 30 Oct 2009 12:55:03 +0100
Subject: mac80211: also drop qos-nullfunc frames silently

We drop nullfunc frames, but not qos-nullfunc frames,
even though those could be used for PS state control
as well.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h | 12 +++++++++++-
 net/mac80211/rx.c         | 15 ++++++++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 52e15e079c61..0aa831467493 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -472,7 +472,7 @@ static inline int ieee80211_is_cfendack(__le16 fc)
 }
 
 /**
- * ieee80211_is_nullfunc - check if FTYPE=IEEE80211_FTYPE_DATA and STYPE=IEEE80211_STYPE_NULLFUNC
+ * ieee80211_is_nullfunc - check if frame is a regular (non-QoS) nullfunc frame
  * @fc: frame control bytes in little-endian byteorder
  */
 static inline int ieee80211_is_nullfunc(__le16 fc)
@@ -481,6 +481,16 @@ static inline int ieee80211_is_nullfunc(__le16 fc)
 	       cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC);
 }
 
+/**
+ * ieee80211_is_qos_nullfunc - check if frame is a QoS nullfunc frame
+ * @fc: frame control bytes in little-endian byteorder
+ */
+static inline int ieee80211_is_qos_nullfunc(__le16 fc)
+{
+	return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) ==
+	       cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC);
+}
+
 struct ieee80211s_hdr {
 	u8 flags;
 	u8 ttl;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index f862399f7ce1..51cb8bc3af81 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -886,12 +886,17 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 		}
 	}
 
-	/* Drop data::nullfunc frames silently, since they are used only to
-	 * control station power saving mode. */
-	if (ieee80211_is_nullfunc(hdr->frame_control)) {
+	/*
+	 * Drop (qos-)data::nullfunc frames silently, since they
+	 * are used only to control station power saving mode.
+	 */
+	if (ieee80211_is_nullfunc(hdr->frame_control) ||
+	    ieee80211_is_qos_nullfunc(hdr->frame_control)) {
 		I802_DEBUG_INC(rx->local->rx_handlers_drop_nullfunc);
-		/* Update counter and free packet here to avoid counting this
-		 * as a dropped packed. */
+		/*
+		 * Update counter and free packet here to avoid
+		 * counting this as a dropped packed.
+		 */
 		sta->rx_packets++;
 		dev_kfree_skb(rx->skb);
 		return RX_QUEUED;
-- 
cgit v1.2.3


From 244546f0d3101c5441f5b14cfe8a79d62679eaea Mon Sep 17 00:00:00 2001
From: Andy Grover <andy.grover@oracle.com>
Date: Fri, 30 Oct 2009 08:54:53 +0000
Subject: RDS: Add GET_MR_FOR_DEST sockopt

RDS currently supports a GET_MR sockopt to establish a
memory region (MR) for a chunk of memory. However, the fastreg
method ties a MR to a particular destination. The GET_MR_FOR_DEST
sockopt allows the remote machine to be specified, and thus
support for fastreg (aka FRWRs).

Note that this patch does *not* do all of this - it simply
implements the new sockopt in terms of the old one, so applications
can begin to use the new sockopt in preparation for cutover to
FRWRs.

Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rds.h |  8 ++++++++
 net/rds/af_rds.c    |  3 +++
 net/rds/rdma.c      | 24 ++++++++++++++++++++++++
 net/rds/rdma.h      |  1 +
 4 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rds.h b/include/linux/rds.h
index 89d46e1afbb1..cab4994c2f63 100644
--- a/include/linux/rds.h
+++ b/include/linux/rds.h
@@ -56,6 +56,7 @@
 /* deprecated: RDS_BARRIER 4 */
 #define RDS_RECVERR			5
 #define RDS_CONG_MONITOR		6
+#define RDS_GET_MR_FOR_DEST		7
 
 /*
  * Control message types for SOL_RDS.
@@ -224,6 +225,13 @@ struct rds_get_mr_args {
 	uint64_t	flags;
 };
 
+struct rds_get_mr_for_dest_args {
+	struct sockaddr_storage	dest_addr;
+	struct rds_iovec 	vec;
+	u_int64_t		cookie_addr;
+	uint64_t		flags;
+};
+
 struct rds_free_mr_args {
 	rds_rdma_cookie_t cookie;
 	u_int64_t	flags;
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index a202e5b36079..2b978dc6e75d 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -265,6 +265,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
 	case RDS_GET_MR:
 		ret = rds_get_mr(rs, optval, optlen);
 		break;
+	case RDS_GET_MR_FOR_DEST:
+		ret = rds_get_mr_for_dest(rs, optval, optlen);
+		break;
 	case RDS_FREE_MR:
 		ret = rds_free_mr(rs, optval, optlen);
 		break;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 8dc83d2caa58..971b5a668458 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -317,6 +317,30 @@ int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
 	return __rds_rdma_map(rs, &args, NULL, NULL);
 }
 
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
+{
+	struct rds_get_mr_for_dest_args args;
+	struct rds_get_mr_args new_args;
+
+	if (optlen != sizeof(struct rds_get_mr_for_dest_args))
+		return -EINVAL;
+
+	if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
+			   sizeof(struct rds_get_mr_for_dest_args)))
+		return -EFAULT;
+
+	/*
+	 * Initially, just behave like get_mr().
+	 * TODO: Implement get_mr as wrapper around this
+	 *	 and deprecate it.
+	 */
+	new_args.vec = args.vec;
+	new_args.cookie_addr = args.cookie_addr;
+	new_args.flags = args.flags;
+
+	return __rds_rdma_map(rs, &new_args, NULL, NULL);
+}
+
 /*
  * Free the MR indicated by the given R_Key
  */
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
index 425512098b0b..909c39835a5d 100644
--- a/net/rds/rdma.h
+++ b/net/rds/rdma.h
@@ -61,6 +61,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
 }
 
 int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
 int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
 void rds_rdma_drop_keys(struct rds_sock *rs);
 int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-- 
cgit v1.2.3


From 19593ffdb6daa6ba691d247a2400cece12687c52 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Wed, 14 Oct 2009 20:40:10 +0200
Subject: firewire: ohci: 0 may be a valid DMA address

I was told that there are obscure architectures with non-coherent DMA
which may DMA-map to bus address 0.  We shall not use 0 as a magic
number of uninitialized bus address variables.

The packet->payload_length > 0 test cannot be used either (except in
at_context_queue_packet) because local requests are not DMA-mapped
regardless of payload_length.  Hence add a state flag to struct
fw_packet.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 drivers/firewire/core-transaction.c | 4 ++--
 drivers/firewire/ohci.c             | 9 +++++----
 include/linux/firewire.h            | 1 +
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 66789c3cc561..842739df23e2 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -226,7 +226,7 @@ static void fw_fill_request(struct fw_packet *packet, int tcode, int tlabel,
 	packet->speed = speed;
 	packet->generation = generation;
 	packet->ack = 0;
-	packet->payload_bus = 0;
+	packet->payload_mapped = false;
 }
 
 /**
@@ -601,7 +601,7 @@ void fw_fill_response(struct fw_packet *response, u32 *request_header,
 		WARN(1, KERN_ERR "wrong tcode %d", tcode);
 	}
 
-	response->payload_bus = 0;
+	response->payload_mapped = false;
 }
 EXPORT_SYMBOL(fw_fill_response);
 
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 418415564791..a71477541dc7 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -995,7 +995,8 @@ static int at_context_queue_packet(struct context *ctx,
 			packet->ack = RCODE_SEND_ERROR;
 			return -1;
 		}
-		packet->payload_bus = payload_bus;
+		packet->payload_bus	= payload_bus;
+		packet->payload_mapped	= true;
 
 		d[2].req_count    = cpu_to_le16(packet->payload_length);
 		d[2].data_address = cpu_to_le32(payload_bus);
@@ -1023,7 +1024,7 @@ static int at_context_queue_packet(struct context *ctx,
 	 */
 	if (ohci->generation != packet->generation ||
 	    reg_read(ohci, OHCI1394_IntEventSet) & OHCI1394_busReset) {
-		if (packet->payload_length > 0)
+		if (packet->payload_mapped)
 			dma_unmap_single(ohci->card.device, payload_bus,
 					 packet->payload_length, DMA_TO_DEVICE);
 		packet->ack = RCODE_GENERATION;
@@ -1059,7 +1060,7 @@ static int handle_at_packet(struct context *context,
 		/* This packet was cancelled, just continue. */
 		return 1;
 
-	if (packet->payload_bus)
+	if (packet->payload_mapped)
 		dma_unmap_single(ohci->card.device, packet->payload_bus,
 				 packet->payload_length, DMA_TO_DEVICE);
 
@@ -1723,7 +1724,7 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet)
 	if (packet->ack != 0)
 		goto out;
 
-	if (packet->payload_bus)
+	if (packet->payload_mapped)
 		dma_unmap_single(ohci->card.device, packet->payload_bus,
 				 packet->payload_length, DMA_TO_DEVICE);
 
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 211a5d7d87b3..9416a461b696 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -267,6 +267,7 @@ struct fw_packet {
 	void *payload;
 	size_t payload_length;
 	dma_addr_t payload_bus;
+	bool payload_mapped;
 	u32 timestamp;
 
 	/*
-- 
cgit v1.2.3


From 72c9528bab94cc052d00ce241b8e85f5d71e45f0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 30 Oct 2009 07:11:27 +0000
Subject: net: Introduce dev_get_by_name_rcu()

Some workloads hit dev_base_lock rwlock pretty hard.
We can use RCU lookups to avoid touching this rwlock
(and avoid touching netdevice refcount)

netdevices are already freed after a RCU grace period, so this patch
adds no penalty at device dismantle time.

However, it adds a synchronize_rcu() call in dev_change_name()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 49 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 41 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e5ece8dceaad..bcf1083857fc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1115,6 +1115,7 @@ extern void		__dev_remove_pack(struct packet_type *pt);
 extern struct net_device	*dev_get_by_flags(struct net *net, unsigned short flags,
 						  unsigned short mask);
 extern struct net_device	*dev_get_by_name(struct net *net, const char *name);
+extern struct net_device	*dev_get_by_name_rcu(struct net *net, const char *name);
 extern struct net_device	*__dev_get_by_name(struct net *net, const char *name);
 extern int		dev_alloc_name(struct net_device *dev, const char *name);
 extern int		dev_open(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index 94f42a15fff1..f54d8b8a434b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -213,7 +213,7 @@ static int list_netdevice(struct net_device *dev)
 
 	write_lock_bh(&dev_base_lock);
 	list_add_tail(&dev->dev_list, &net->dev_base_head);
-	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
+	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 	hlist_add_head_rcu(&dev->index_hlist,
 			   dev_index_hash(net, dev->ifindex));
 	write_unlock_bh(&dev_base_lock);
@@ -230,7 +230,7 @@ static void unlist_netdevice(struct net_device *dev)
 	/* Unlink dev from the device chain */
 	write_lock_bh(&dev_base_lock);
 	list_del(&dev->dev_list);
-	hlist_del(&dev->name_hlist);
+	hlist_del_rcu(&dev->name_hlist);
 	hlist_del_rcu(&dev->index_hlist);
 	write_unlock_bh(&dev_base_lock);
 }
@@ -598,6 +598,32 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name)
 }
 EXPORT_SYMBOL(__dev_get_by_name);
 
+/**
+ *	dev_get_by_name_rcu	- find a device by its name
+ *	@net: the applicable net namespace
+ *	@name: name to find
+ *
+ *	Find an interface by name.
+ *	If the name is found a pointer to the device is returned.
+ * 	If the name is not found then %NULL is returned.
+ *	The reference counters are not incremented so the caller must be
+ *	careful with locks. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
+{
+	struct hlist_node *p;
+	struct net_device *dev;
+	struct hlist_head *head = dev_name_hash(net, name);
+
+	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
+		if (!strncmp(dev->name, name, IFNAMSIZ))
+			return dev;
+
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_name_rcu);
+
 /**
  *	dev_get_by_name		- find a device by its name
  *	@net: the applicable net namespace
@@ -614,11 +640,11 @@ struct net_device *dev_get_by_name(struct net *net, const char *name)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_name(net, name);
+	rcu_read_lock();
+	dev = dev_get_by_name_rcu(net, name);
 	if (dev)
 		dev_hold(dev);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return dev;
 }
 EXPORT_SYMBOL(dev_get_by_name);
@@ -960,7 +986,12 @@ rollback:
 
 	write_lock_bh(&dev_base_lock);
 	hlist_del(&dev->name_hlist);
-	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
+	write_unlock_bh(&dev_base_lock);
+
+	synchronize_rcu();
+
+	write_lock_bh(&dev_base_lock);
+	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 	write_unlock_bh(&dev_base_lock);
 
 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
@@ -1062,9 +1093,9 @@ void dev_load(struct net *net, const char *name)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_name(net, name);
-	read_unlock(&dev_base_lock);
+	rcu_read_lock();
+	dev = dev_get_by_name_rcu(net, name);
+	rcu_read_unlock();
 
 	if (!dev && capable(CAP_NET_ADMIN))
 		request_module("%s", name);
-- 
cgit v1.2.3


From 4f570f995f68ef77aae7e5a441222f59232f2d0e Mon Sep 17 00:00:00 2001
From: Alberto Bertogli <albertito@blitiri.com.ar>
Date: Mon, 2 Nov 2009 11:40:16 +0100
Subject: Do not __always_inline bvec_kmap_irq() and bvec_kunmap_irq()

So remove both the comment and the inline requirement, going back to the
inline hint.

Signed-off-by: Alberto Bertogli <albertito@blitiri.com.ar>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5be93f18d842..474792b825d0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -450,11 +450,8 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;
 /*
  * remember never ever reenable interrupts between a bvec_kmap_irq and
  * bvec_kunmap_irq!
- *
- * This function MUST be inlined - it plays with the CPU interrupt flags.
  */
-static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
-		unsigned long *flags)
+static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
 {
 	unsigned long addr;
 
@@ -470,8 +467,7 @@ static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
 	return (char *) addr + bvec->bv_offset;
 }
 
-static __always_inline void bvec_kunmap_irq(char *buffer,
-		unsigned long *flags)
+static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
 {
 	unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
 
-- 
cgit v1.2.3


From 7b2a35132ad0a70902dcd2844c27ed64cda0ce9b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 2 Nov 2009 08:50:52 +0800
Subject: compiler: Introduce __always_unused

I wrote some code which is used as compile-time checker, and the
code should be elided after compile.

So I need to annotate the code as "always unused", compared to
"maybe unused".

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4AEE2CEC.8040206@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/compiler-gcc.h | 1 +
 include/linux/compiler.h     | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index a3ed7cb8ca34..73dcf804bc94 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -79,6 +79,7 @@
 #define  noinline			__attribute__((noinline))
 #define __attribute_const__		__attribute__((__const__))
 #define __maybe_unused			__attribute__((unused))
+#define __always_unused			__attribute__((unused))
 
 #define __gcc_header(x) #x
 #define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h)
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 04fb5135b4e1..7947f4f6fa51 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -213,6 +213,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 # define __maybe_unused		/* unimplemented */
 #endif
 
+#ifndef __always_unused
+# define __always_unused	/* unimplemented */
+#endif
+
 #ifndef noinline
 #define noinline
 #endif
-- 
cgit v1.2.3


From 8649f13d2d810406da444a6101906041b796fbde Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Mon, 2 Nov 2009 14:30:00 +0000
Subject: broadcom: Consolidate dev_flags definitions

This patch moves all the dev_flags enumerations outside the broadcom.c
file to include/linux/brcmphy.h.  The existing flags were not used yet
and have been re-enumerated to avoid conflicts.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 11 +----------
 include/linux/brcmphy.h    | 17 +++++++++++------
 2 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 1a2b2f2a273a..ace0ccc87a00 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -16,6 +16,7 @@
 
 #include <linux/module.h>
 #include <linux/phy.h>
+#include <linux/brcmphy.h>
 
 #define PHY_ID_BCM50610		0x0143bd60
 #define PHY_ID_BCM50610M	0x0143bd70
@@ -138,16 +139,6 @@
 #define BCM5482_SSD_SGMII_SLAVE_EN	0x0002	/* Slave mode enable */
 #define BCM5482_SSD_SGMII_SLAVE_AD	0x0001	/* Slave auto-detection */
 
-/*
- * Device flags for PHYs that can be configured for different operating
- * modes.
- */
-#define PHY_BCM_FLAGS_VALID		0x80000000
-#define PHY_BCM_FLAGS_INTF_XAUI		0x00000020
-#define PHY_BCM_FLAGS_INTF_SGMII	0x00000010
-#define PHY_BCM_FLAGS_MODE_1000BX	0x00000002
-#define PHY_BCM_FLAGS_MODE_COPPER	0x00000001
-
 
 /*****************************************************************************/
 /* Fast Ethernet Transceiver definitions. */
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 9b64b6d67873..daa1480fcf49 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -1,6 +1,11 @@
-#define PHY_BRCM_WIRESPEED_ENABLE	0x00000001
-#define PHY_BRCM_AUTO_PWRDWN_ENABLE	0x00000002
-#define PHY_BRCM_APD_CLK125_ENABLE	0x00000004
-#define PHY_BRCM_STD_IBND_DISABLE	0x00000008
-#define PHY_BRCM_EXT_IBND_RX_ENABLE	0x00000010
-#define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00000020
+#define PHY_BCM_FLAGS_MODE_COPPER	0x00000001
+#define PHY_BCM_FLAGS_MODE_1000BX	0x00000002
+#define PHY_BCM_FLAGS_INTF_SGMII	0x00000010
+#define PHY_BCM_FLAGS_INTF_XAUI		0x00000020
+#define PHY_BRCM_WIRESPEED_ENABLE	0x00000100
+#define PHY_BRCM_AUTO_PWRDWN_ENABLE	0x00000200
+#define PHY_BRCM_APD_CLK125_ENABLE	0x00000400
+#define PHY_BRCM_STD_IBND_DISABLE	0x00000800
+#define PHY_BRCM_EXT_IBND_RX_ENABLE	0x00001000
+#define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00002000
+#define PHY_BCM_FLAGS_VALID		0x80000000
-- 
cgit v1.2.3


From 63a14ce449dd6d647de2725809159eb072b2c44f Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Mon, 2 Nov 2009 14:30:40 +0000
Subject: tg3 / broadcom: Add PHY_BRCM_CLEAR_RGMII_MODE flag

Broadcom 50610M parts changed the default definitions of the RGMII mode
shadow register.  The 5785 needs the RGMII mode selection bits [4:3]
cleared.

The default value of the remaining bits in this register are zero.
Rather than unnecessarily burn an extra bit in the dev_flags member in
an attempt to enumerate all possible combinations, this patch take a
more course grained approach and labels the option as "clear all bits".

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 6 ++++++
 drivers/net/tg3.c          | 1 +
 include/linux/brcmphy.h    | 1 +
 3 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index ace0ccc87a00..5d2a2e90aba8 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -105,6 +105,7 @@
 #define BCM5482_SHD_LEDS1_LED3(src)	((src & 0xf) << 4)
 					/* LED1 / ~LINKSPD[1] selector */
 #define BCM5482_SHD_LEDS1_LED1(src)	((src & 0xf) << 0)
+#define BCM54XX_SHD_RGMII_MODE	0x0b	/* 01011: RGMII Mode Selector */
 #define BCM5482_SHD_SSD		0x14	/* 10100: Secondary SerDes control */
 #define BCM5482_SHD_SSD_LEDM	0x0008	/* SSD LED Mode enable */
 #define BCM5482_SHD_SSD_EN	0x0001	/* SSD enable */
@@ -330,6 +331,11 @@ static int bcm54xx_config_init(struct phy_device *phydev)
 	if (err < 0)
 		return err;
 
+	if ((BRCM_PHY_MODEL(phydev) == PHY_ID_BCM50610 ||
+	     BRCM_PHY_MODEL(phydev) == PHY_ID_BCM50610M) &&
+	    (phydev->dev_flags & PHY_BRCM_CLEAR_RGMII_MODE))
+		bcm54xx_shadow_write(phydev, BCM54XX_SHD_RGMII_MODE, 0);
+
 	bcm54xx_phydsp_config(phydev);
 
 	return 0;
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 0aecd07d3f6b..e7128f6ae2d8 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -1100,6 +1100,7 @@ static int tg3_mdio_init(struct tg3 *tp)
 		break;
 	case TG3_PHY_ID_BCM50610:
 	case TG3_PHY_ID_BCM50610M:
+		phydev->dev_flags |= PHY_BRCM_CLEAR_RGMII_MODE;
 		if (tp->tg3_flags3 & TG3_FLG3_RGMII_STD_IBND_DISABLE)
 			phydev->dev_flags |= PHY_BRCM_STD_IBND_DISABLE;
 		if (tp->tg3_flags3 & TG3_FLG3_RGMII_EXT_IBND_RX_EN)
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index daa1480fcf49..6e7ffcee9c80 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -8,4 +8,5 @@
 #define PHY_BRCM_STD_IBND_DISABLE	0x00000800
 #define PHY_BRCM_EXT_IBND_RX_ENABLE	0x00001000
 #define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00002000
+#define PHY_BRCM_CLEAR_RGMII_MODE	0x00004000
 #define PHY_BCM_FLAGS_VALID		0x80000000
-- 
cgit v1.2.3


From 32e5a8d651c0dbb02bf82ca954206282e44c4b11 Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Mon, 2 Nov 2009 14:31:39 +0000
Subject: tg3 / broadcom: Add code to disable rxc refclk

The 5785 does not use the RXC reference clock.  Turning it off is
desirable as it saves power.

By default, the 50610 enables the RXC reference clock and the 50610M
disables it.  Presumably this is one of the reasons why the hardware
architect chose one over the other.

Adding a "rx reference clock disable" flag is not the ideal way to
describe the option, as it would force the MAC using a 50610M to set
the flag.  Ideally we want the flags to represent opt-in behavior that
deviates from hardware defaults.  Furthermore, the lack of a
"disable" flag implies that the requester wants the rx reference clock
enabled, which doesn't necessarily follow.

By presenting the option as a passive statement (rx reference clock
unused) rather than a command, I hope to convey an opt-in option to
disable the rx reference clock that falls back to hardware defaults if
not set.  A secondary benefit of this is that it keeps the
intelligence about phy defaults in the broadcom module where it belongs
and allows the broadcom module more latitude should a bug arise.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 drivers/net/tg3.c          |  3 ++-
 include/linux/brcmphy.h    |  2 +-
 3 files changed, 46 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index bddf4a42ae68..74914335f72c 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -25,6 +25,9 @@
 #define BRCM_PHY_MODEL(phydev) \
 	((phydev)->drv->phy_id & (phydev)->drv->phy_id_mask)
 
+#define BRCM_PHY_REV(phydev) \
+	((phydev)->drv->phy_id & ~((phydev)->drv->phy_id_mask))
+
 
 #define MII_BCM54XX_ECR		0x10	/* BCM54xx extended control register */
 #define MII_BCM54XX_ECR_IM	0x1000	/* Interrupt mask */
@@ -95,11 +98,16 @@
 #define BCM_LED_SRC_OFF		0xe	/* Tied high */
 #define BCM_LED_SRC_ON		0xf	/* Tied low */
 
+
 /*
  * BCM5482: Shadow registers
  * Shadow values go into bits [14:10] of register 0x1c to select a shadow
  * register to access.
  */
+/* 00101: Spare Control Register 3 */
+#define BCM54XX_SHD_SCR3		0x05
+#define  BCM54XX_SHD_SCR3_DEF_CLK125	0x0001
+
 #define BCM5482_SHD_LEDS1	0x0d	/* 01101: LED Selector 1 */
 					/* LED3 / ~LINKSPD[2] selector */
 #define BCM5482_SHD_LEDS1_LED3(src)	((src & 0xf) << 4)
@@ -112,6 +120,7 @@
 #define BCM5482_SHD_MODE	0x1f	/* 11111: Mode Control Register */
 #define BCM5482_SHD_MODE_1000BX	0x0001	/* Enable 1000BASE-X registers */
 
+
 /*
  * EXPANSION SHADOW ACCESS REGISTERS.  (PHY REG 0x15, 0x16, and 0x17)
  */
@@ -309,6 +318,37 @@ error:
 	return err ? err : err2;
 }
 
+static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev)
+{
+	u32 val, orig;
+
+	/* Abort if we are using an untested phy. */
+	if (BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610 ||
+	    BRCM_PHY_MODEL(phydev) != PHY_ID_BCM50610M)
+		return;
+
+	val = bcm54xx_shadow_read(phydev, BCM54XX_SHD_SCR3);
+	if (val < 0)
+		return;
+
+	orig = val;
+
+	if (phydev->dev_flags & PHY_BRCM_RX_REFCLK_UNUSED) {
+		if ((BRCM_PHY_MODEL(phydev) == PHY_ID_BCM50610 ||
+		     BRCM_PHY_MODEL(phydev) == PHY_ID_BCM50610M) &&
+		    BRCM_PHY_REV(phydev) >= 0x3) {
+			/* Here, bit 0 _disables_ CLK125 when set */
+			val |= BCM54XX_SHD_SCR3_DEF_CLK125;
+		} else {
+			/* Here, bit 0 _enables_ CLK125 when set */
+			val &= ~BCM54XX_SHD_SCR3_DEF_CLK125;
+		}
+	}
+
+	if (orig != val)
+		bcm54xx_shadow_write(phydev, BCM54XX_SHD_SCR3, val);
+}
+
 static int bcm54xx_config_init(struct phy_device *phydev)
 {
 	int reg, err;
@@ -336,6 +376,9 @@ static int bcm54xx_config_init(struct phy_device *phydev)
 	    (phydev->dev_flags & PHY_BRCM_CLEAR_RGMII_MODE))
 		bcm54xx_shadow_write(phydev, BCM54XX_SHD_RGMII_MODE, 0);
 
+	if (phydev->dev_flags & PHY_BRCM_RX_REFCLK_UNUSED)
+		bcm54xx_adjust_rxrefclk(phydev);
+
 	bcm54xx_phydsp_config(phydev);
 
 	return 0;
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 592b5bf09e40..369ddba95821 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -1100,7 +1100,8 @@ static int tg3_mdio_init(struct tg3 *tp)
 		break;
 	case TG3_PHY_ID_BCM50610:
 	case TG3_PHY_ID_BCM50610M:
-		phydev->dev_flags |= PHY_BRCM_CLEAR_RGMII_MODE;
+		phydev->dev_flags |= PHY_BRCM_CLEAR_RGMII_MODE |
+				     PHY_BRCM_RX_REFCLK_UNUSED;
 		if (tp->tg3_flags3 & TG3_FLG3_RGMII_STD_IBND_DISABLE)
 			phydev->dev_flags |= PHY_BRCM_STD_IBND_DISABLE;
 		if (tp->tg3_flags3 & TG3_FLG3_RGMII_EXT_IBND_RX_EN)
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 6e7ffcee9c80..59432278ded2 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -4,7 +4,7 @@
 #define PHY_BCM_FLAGS_INTF_XAUI		0x00000020
 #define PHY_BRCM_WIRESPEED_ENABLE	0x00000100
 #define PHY_BRCM_AUTO_PWRDWN_ENABLE	0x00000200
-#define PHY_BRCM_APD_CLK125_ENABLE	0x00000400
+#define PHY_BRCM_RX_REFCLK_UNUSED	0x00000400
 #define PHY_BRCM_STD_IBND_DISABLE	0x00000800
 #define PHY_BRCM_EXT_IBND_RX_ENABLE	0x00001000
 #define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00002000
-- 
cgit v1.2.3


From 52fae0837153e86e4dabaf5df517a0b8b7a20bd7 Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Mon, 2 Nov 2009 14:32:38 +0000
Subject: tg3 / broadcom: Optionally disable TXC if no link

This patch adds code to disable the TXC and RXC reference clocks if link
is not available.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 5 +++++
 drivers/net/tg3.c          | 1 +
 include/linux/brcmphy.h    | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 7b10495fe8fb..f63c96a4ecb4 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -108,6 +108,7 @@
 #define BCM54XX_SHD_SCR3		0x05
 #define  BCM54XX_SHD_SCR3_DEF_CLK125	0x0001
 #define  BCM54XX_SHD_SCR3_DLLAPD_DIS	0x0002
+#define  BCM54XX_SHD_SCR3_TRDDAPD	0x0004
 
 /* 01010: Auto Power-Down */
 #define BCM54XX_SHD_APD			0x0a
@@ -362,6 +363,9 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev)
 	else
 		val |= BCM54XX_SHD_SCR3_DLLAPD_DIS;
 
+	if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY)
+		val |= BCM54XX_SHD_SCR3_TRDDAPD;
+
 	if (orig != val)
 		bcm54xx_shadow_write(phydev, BCM54XX_SHD_SCR3, val);
 
@@ -409,6 +413,7 @@ static int bcm54xx_config_init(struct phy_device *phydev)
 		bcm54xx_shadow_write(phydev, BCM54XX_SHD_RGMII_MODE, 0);
 
 	if ((phydev->dev_flags & PHY_BRCM_RX_REFCLK_UNUSED) ||
+	    (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) ||
 	    (phydev->dev_flags & PHY_BRCM_AUTO_PWRDWN_ENABLE))
 		bcm54xx_adjust_rxrefclk(phydev);
 
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index f74bf91e78cc..50bb53de5ebd 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -1103,6 +1103,7 @@ static int tg3_mdio_init(struct tg3 *tp)
 	case TG3_PHY_ID_BCM50610M:
 		phydev->dev_flags |= PHY_BRCM_CLEAR_RGMII_MODE |
 				     PHY_BRCM_RX_REFCLK_UNUSED |
+				     PHY_BRCM_DIS_TXCRXC_NOENRGY |
 				     PHY_BRCM_AUTO_PWRDWN_ENABLE;
 		if (tp->tg3_flags3 & TG3_FLG3_RGMII_STD_IBND_DISABLE)
 			phydev->dev_flags |= PHY_BRCM_STD_IBND_DISABLE;
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 59432278ded2..2b31b91f5871 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -9,4 +9,5 @@
 #define PHY_BRCM_EXT_IBND_RX_ENABLE	0x00001000
 #define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00002000
 #define PHY_BRCM_CLEAR_RGMII_MODE	0x00004000
+#define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00008000
 #define PHY_BCM_FLAGS_VALID		0x80000000
-- 
cgit v1.2.3


From 7a8b3372e29ff58ebdf94def26703afabd287f11 Mon Sep 17 00:00:00 2001
From: Sandeep Gopalpet <Sandeep.Kumar@freescale.com>
Date: Mon, 2 Nov 2009 07:03:40 +0000
Subject: gianfar: Basic Support for programming hash rules

This patch provides basic hash rules programming via the ethtool
interface.

Signed-off-by: Sandeep Gopalpet <Sandeep.Kumar@freescale.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/gianfar.c         |  73 +++++++++++++
 drivers/net/gianfar.h         |  93 +++++++++++++++++
 drivers/net/gianfar_ethtool.c | 236 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/ethtool.h       |   2 +
 4 files changed, 404 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index dc9fba09b17c..086d40dd526d 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -431,6 +431,9 @@ static const struct net_device_ops gfar_netdev_ops = {
 #endif
 };
 
+unsigned int ftp_rqfpr[MAX_FILER_IDX + 1];
+unsigned int ftp_rqfcr[MAX_FILER_IDX + 1];
+
 void lock_rx_qs(struct gfar_private *priv)
 {
 	int i = 0x0;
@@ -766,6 +769,73 @@ static unsigned int reverse_bitmap(unsigned int bit_map, unsigned int max_qs)
 	}
 	return new_bit_map;
 }
+
+u32 cluster_entry_per_class(struct gfar_private *priv, u32 rqfar, u32 class)
+{
+	u32 rqfpr = FPR_FILER_MASK;
+	u32 rqfcr = 0x0;
+
+	rqfar--;
+	rqfcr = RQFCR_CLE | RQFCR_PID_MASK | RQFCR_CMP_EXACT;
+	ftp_rqfpr[rqfar] = rqfpr;
+	ftp_rqfcr[rqfar] = rqfcr;
+	gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
+
+	rqfar--;
+	rqfcr = RQFCR_CMP_NOMATCH;
+	ftp_rqfpr[rqfar] = rqfpr;
+	ftp_rqfcr[rqfar] = rqfcr;
+	gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
+
+	rqfar--;
+	rqfcr = RQFCR_CMP_EXACT | RQFCR_PID_PARSE | RQFCR_CLE | RQFCR_AND;
+	rqfpr = class;
+	ftp_rqfcr[rqfar] = rqfcr;
+	ftp_rqfpr[rqfar] = rqfpr;
+	gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
+
+	rqfar--;
+	rqfcr = RQFCR_CMP_EXACT | RQFCR_PID_MASK | RQFCR_AND;
+	rqfpr = class;
+	ftp_rqfcr[rqfar] = rqfcr;
+	ftp_rqfpr[rqfar] = rqfpr;
+	gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
+
+	return rqfar;
+}
+
+static void gfar_init_filer_table(struct gfar_private *priv)
+{
+	int i = 0x0;
+	u32 rqfar = MAX_FILER_IDX;
+	u32 rqfcr = 0x0;
+	u32 rqfpr = FPR_FILER_MASK;
+
+	/* Default rule */
+	rqfcr = RQFCR_CMP_MATCH;
+	ftp_rqfcr[rqfar] = rqfcr;
+	ftp_rqfpr[rqfar] = rqfpr;
+	gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
+
+	rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV6);
+	rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV6 | RQFPR_UDP);
+	rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV6 | RQFPR_TCP);
+	rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV4);
+	rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV4 | RQFPR_UDP);
+	rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV4 | RQFPR_TCP);
+
+	/* cur_filer_idx indicated the fisrt non-masked rule */
+	priv->cur_filer_idx = rqfar;
+
+	/* Rest are masked rules */
+	rqfcr = RQFCR_CMP_NOMATCH;
+	for (i = 0; i < rqfar; i++) {
+		ftp_rqfcr[i] = rqfcr;
+		ftp_rqfpr[i] = rqfpr;
+		gfar_write_filer(priv, i, rqfcr, rqfpr);
+	}
+}
+
 /* Set up the ethernet device structure, private data,
  * and anything else we need before we start */
 static int gfar_probe(struct of_device *ofdev,
@@ -1005,6 +1075,9 @@ static int gfar_probe(struct of_device *ofdev,
 			priv->gfargrp[i].int_name_tx[len_devname] = '\0';
 	}
 
+	/* Initialize the filer table */
+	gfar_init_filer_table(priv);
+
 	/* Create all the sysfs files */
 	gfar_init_sysfs(dev);
 
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index a2c1f963cdd6..44b63daa7ff3 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -381,6 +381,84 @@ extern const char gfar_driver_version[];
 #define BD_LFLAG(flags) ((flags) << 16)
 #define BD_LENGTH_MASK		0x0000ffff
 
+#define CLASS_CODE_UNRECOG		0x00
+#define CLASS_CODE_DUMMY1		0x01
+#define CLASS_CODE_ETHERTYPE1		0x02
+#define CLASS_CODE_ETHERTYPE2		0x03
+#define CLASS_CODE_USER_PROG1		0x04
+#define CLASS_CODE_USER_PROG2		0x05
+#define CLASS_CODE_USER_PROG3		0x06
+#define CLASS_CODE_USER_PROG4		0x07
+#define CLASS_CODE_TCP_IPV4		0x08
+#define CLASS_CODE_UDP_IPV4		0x09
+#define CLASS_CODE_AH_ESP_IPV4		0x0a
+#define CLASS_CODE_SCTP_IPV4		0x0b
+#define CLASS_CODE_TCP_IPV6		0x0c
+#define CLASS_CODE_UDP_IPV6		0x0d
+#define CLASS_CODE_AH_ESP_IPV6		0x0e
+#define CLASS_CODE_SCTP_IPV6		0x0f
+
+#define FPR_FILER_MASK	0xFFFFFFFF
+#define MAX_FILER_IDX	0xFF
+
+/* RQFCR register bits */
+#define RQFCR_GPI		0x80000000
+#define RQFCR_HASHTBL_Q		0x00000000
+#define RQFCR_HASHTBL_0		0x00020000
+#define RQFCR_HASHTBL_1		0x00040000
+#define RQFCR_HASHTBL_2		0x00060000
+#define RQFCR_HASHTBL_3		0x00080000
+#define RQFCR_HASH		0x00010000
+#define RQFCR_CLE		0x00000200
+#define RQFCR_RJE		0x00000100
+#define RQFCR_AND		0x00000080
+#define RQFCR_CMP_EXACT		0x00000000
+#define RQFCR_CMP_MATCH		0x00000020
+#define RQFCR_CMP_NOEXACT	0x00000040
+#define RQFCR_CMP_NOMATCH	0x00000060
+
+/* RQFCR PID values */
+#define	RQFCR_PID_MASK		0x00000000
+#define	RQFCR_PID_PARSE		0x00000001
+#define	RQFCR_PID_ARB		0x00000002
+#define	RQFCR_PID_DAH		0x00000003
+#define	RQFCR_PID_DAL		0x00000004
+#define	RQFCR_PID_SAH		0x00000005
+#define	RQFCR_PID_SAL		0x00000006
+#define	RQFCR_PID_ETY		0x00000007
+#define	RQFCR_PID_VID		0x00000008
+#define	RQFCR_PID_PRI		0x00000009
+#define	RQFCR_PID_TOS		0x0000000A
+#define	RQFCR_PID_L4P		0x0000000B
+#define	RQFCR_PID_DIA		0x0000000C
+#define	RQFCR_PID_SIA		0x0000000D
+#define	RQFCR_PID_DPT		0x0000000E
+#define	RQFCR_PID_SPT		0x0000000F
+
+/* RQFPR when PID is 0x0001 */
+#define RQFPR_HDR_GE_512	0x00200000
+#define RQFPR_LERR		0x00100000
+#define RQFPR_RAR		0x00080000
+#define RQFPR_RARQ		0x00040000
+#define RQFPR_AR		0x00020000
+#define RQFPR_ARQ		0x00010000
+#define RQFPR_EBC		0x00008000
+#define RQFPR_VLN		0x00004000
+#define RQFPR_CFI		0x00002000
+#define RQFPR_JUM		0x00001000
+#define RQFPR_IPF		0x00000800
+#define RQFPR_FIF		0x00000400
+#define RQFPR_IPV4		0x00000200
+#define RQFPR_IPV6		0x00000100
+#define RQFPR_ICC		0x00000080
+#define RQFPR_ICV		0x00000040
+#define RQFPR_TCP		0x00000020
+#define RQFPR_UDP		0x00000010
+#define RQFPR_TUC		0x00000008
+#define RQFPR_TUV		0x00000004
+#define RQFPR_PER		0x00000002
+#define RQFPR_EER		0x00000001
+
 /* TxBD status field bits */
 #define TXBD_READY		0x8000
 #define TXBD_PADCRC		0x4000
@@ -959,6 +1037,8 @@ struct gfar_private {
 	unsigned int rx_stash_size;
 	unsigned int rx_stash_index;
 
+	u32 cur_filer_idx;
+
 	struct sk_buff_head rx_recycle;
 
 	struct vlan_group *vlgrp;
@@ -1002,6 +1082,9 @@ struct gfar_private {
 	struct gfar_extra_stats extra_stats;
 };
 
+extern unsigned int ftp_rqfpr[MAX_FILER_IDX + 1];
+extern unsigned int ftp_rqfcr[MAX_FILER_IDX + 1];
+
 static inline u32 gfar_read(volatile unsigned __iomem *addr)
 {
 	u32 val;
@@ -1014,6 +1097,16 @@ static inline void gfar_write(volatile unsigned __iomem *addr, u32 val)
 	out_be32(addr, val);
 }
 
+static inline void gfar_write_filer(struct gfar_private *priv,
+		unsigned int far, unsigned int fcr, unsigned int fpr)
+{
+	struct gfar __iomem *regs = priv->gfargrp[0].regs;
+
+	gfar_write(&regs->rqfar, far);
+	gfar_write(&regs->rqfcr, fcr);
+	gfar_write(&regs->rqfpr, fpr);
+}
+
 extern void lock_rx_qs(struct gfar_private *priv);
 extern void lock_tx_qs(struct gfar_private *priv);
 extern void unlock_rx_qs(struct gfar_private *priv);
diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c
index 562f6c20f591..1010367695e4 100644
--- a/drivers/net/gianfar_ethtool.c
+++ b/drivers/net/gianfar_ethtool.c
@@ -645,6 +645,241 @@ static int gfar_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 }
 #endif
 
+static int gfar_ethflow_to_class(int flow_type, u64 *class)
+{
+	switch (flow_type) {
+	case TCP_V4_FLOW:
+		*class = CLASS_CODE_TCP_IPV4;
+		break;
+	case UDP_V4_FLOW:
+		*class = CLASS_CODE_UDP_IPV4;
+		break;
+	case AH_V4_FLOW:
+	case ESP_V4_FLOW:
+		*class = CLASS_CODE_AH_ESP_IPV4;
+		break;
+	case SCTP_V4_FLOW:
+		*class = CLASS_CODE_SCTP_IPV4;
+		break;
+	case TCP_V6_FLOW:
+		*class = CLASS_CODE_TCP_IPV6;
+		break;
+	case UDP_V6_FLOW:
+		*class = CLASS_CODE_UDP_IPV6;
+		break;
+	case AH_V6_FLOW:
+	case ESP_V6_FLOW:
+		*class = CLASS_CODE_AH_ESP_IPV6;
+		break;
+	case SCTP_V6_FLOW:
+		*class = CLASS_CODE_SCTP_IPV6;
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static void ethflow_to_filer_rules (struct gfar_private *priv, u64 ethflow)
+{
+	u32 fcr = 0x0, fpr = FPR_FILER_MASK;
+
+	if (ethflow & RXH_L2DA) {
+		fcr = RQFCR_PID_DAH |RQFCR_CMP_NOMATCH |
+			RQFCR_HASH | RQFCR_AND | RQFCR_HASHTBL_0;
+		ftp_rqfpr[priv->cur_filer_idx] = fpr;
+		ftp_rqfcr[priv->cur_filer_idx] = fcr;
+		gfar_write_filer(priv, priv->cur_filer_idx, fcr, fpr);
+		priv->cur_filer_idx = priv->cur_filer_idx - 1;
+
+		fcr = RQFCR_PID_DAL | RQFCR_AND | RQFCR_CMP_NOMATCH |
+				RQFCR_HASH | RQFCR_AND | RQFCR_HASHTBL_0;
+		ftp_rqfpr[priv->cur_filer_idx] = fpr;
+		ftp_rqfcr[priv->cur_filer_idx] = fcr;
+		gfar_write_filer(priv, priv->cur_filer_idx, fcr, fpr);
+		priv->cur_filer_idx = priv->cur_filer_idx - 1;
+	}
+
+	if (ethflow & RXH_VLAN) {
+		fcr = RQFCR_PID_VID | RQFCR_CMP_NOMATCH | RQFCR_HASH |
+				RQFCR_AND | RQFCR_HASHTBL_0;
+		gfar_write_filer(priv, priv->cur_filer_idx, fcr, fpr);
+		ftp_rqfpr[priv->cur_filer_idx] = fpr;
+		ftp_rqfcr[priv->cur_filer_idx] = fcr;
+		priv->cur_filer_idx = priv->cur_filer_idx - 1;
+	}
+
+	if (ethflow & RXH_IP_SRC) {
+		fcr = RQFCR_PID_SIA | RQFCR_CMP_NOMATCH | RQFCR_HASH |
+			RQFCR_AND | RQFCR_HASHTBL_0;
+		ftp_rqfpr[priv->cur_filer_idx] = fpr;
+		ftp_rqfcr[priv->cur_filer_idx] = fcr;
+		gfar_write_filer(priv, priv->cur_filer_idx, fcr, fpr);
+		priv->cur_filer_idx = priv->cur_filer_idx - 1;
+	}
+
+	if (ethflow & (RXH_IP_DST)) {
+		fcr = RQFCR_PID_DIA | RQFCR_CMP_NOMATCH | RQFCR_HASH |
+			RQFCR_AND | RQFCR_HASHTBL_0;
+		ftp_rqfpr[priv->cur_filer_idx] = fpr;
+		ftp_rqfcr[priv->cur_filer_idx] = fcr;
+		gfar_write_filer(priv, priv->cur_filer_idx, fcr, fpr);
+		priv->cur_filer_idx = priv->cur_filer_idx - 1;
+	}
+
+	if (ethflow & RXH_L3_PROTO) {
+		fcr = RQFCR_PID_L4P | RQFCR_CMP_NOMATCH | RQFCR_HASH |
+			RQFCR_AND | RQFCR_HASHTBL_0;
+		ftp_rqfpr[priv->cur_filer_idx] = fpr;
+		ftp_rqfcr[priv->cur_filer_idx] = fcr;
+		gfar_write_filer(priv, priv->cur_filer_idx, fcr, fpr);
+		priv->cur_filer_idx = priv->cur_filer_idx - 1;
+	}
+
+	if (ethflow & RXH_L4_B_0_1) {
+		fcr = RQFCR_PID_SPT | RQFCR_CMP_NOMATCH | RQFCR_HASH |
+			RQFCR_AND | RQFCR_HASHTBL_0;
+		ftp_rqfpr[priv->cur_filer_idx] = fpr;
+		ftp_rqfcr[priv->cur_filer_idx] = fcr;
+		gfar_write_filer(priv, priv->cur_filer_idx, fcr, fpr);
+		priv->cur_filer_idx = priv->cur_filer_idx - 1;
+	}
+
+	if (ethflow & RXH_L4_B_2_3) {
+		fcr = RQFCR_PID_DPT | RQFCR_CMP_NOMATCH | RQFCR_HASH |
+			RQFCR_AND | RQFCR_HASHTBL_0;
+		ftp_rqfpr[priv->cur_filer_idx] = fpr;
+		ftp_rqfcr[priv->cur_filer_idx] = fcr;
+		gfar_write_filer(priv, priv->cur_filer_idx, fcr, fpr);
+		priv->cur_filer_idx = priv->cur_filer_idx - 1;
+	}
+}
+
+static int gfar_ethflow_to_filer_table(struct gfar_private *priv, u64 ethflow, u64 class)
+{
+	unsigned int last_rule_idx = priv->cur_filer_idx;
+	unsigned int cmp_rqfpr;
+	unsigned int local_rqfpr[MAX_FILER_IDX + 1];
+	unsigned int local_rqfcr[MAX_FILER_IDX + 1];
+	int i = 0x0, k = 0x0;
+	int j = MAX_FILER_IDX, l = 0x0;
+
+	switch (class) {
+	case TCP_V4_FLOW:
+		cmp_rqfpr = RQFPR_IPV4 |RQFPR_TCP;
+		break;
+	case UDP_V4_FLOW:
+		cmp_rqfpr = RQFPR_IPV4 |RQFPR_UDP;
+		break;
+	case TCP_V6_FLOW:
+		cmp_rqfpr = RQFPR_IPV6 |RQFPR_TCP;
+		break;
+	case UDP_V6_FLOW:
+		cmp_rqfpr = RQFPR_IPV6 |RQFPR_UDP;
+		break;
+	case IPV4_FLOW:
+		cmp_rqfpr = RQFPR_IPV4;
+	case IPV6_FLOW:
+		cmp_rqfpr = RQFPR_IPV6;
+		break;
+	default:
+		printk(KERN_ERR "Right now this class is not supported\n");
+		return 0;
+	}
+
+	for (i = 0; i < MAX_FILER_IDX + 1; i++) {
+		local_rqfpr[j] = ftp_rqfpr[i];
+		local_rqfcr[j] = ftp_rqfcr[i];
+		j--;
+		if ((ftp_rqfcr[i] == (RQFCR_PID_PARSE |
+			RQFCR_CLE |RQFCR_AND)) &&
+			(ftp_rqfpr[i] == cmp_rqfpr))
+			break;
+	}
+
+	if (i == MAX_FILER_IDX + 1) {
+		printk(KERN_ERR "No parse rule found, ");
+		printk(KERN_ERR "can't create hash rules\n");
+		return 0;
+	}
+
+	/* If a match was found, then it begins the starting of a cluster rule
+	 * if it was already programmed, we need to overwrite these rules
+	 */
+	for (l = i+1; l < MAX_FILER_IDX; l++) {
+		if ((ftp_rqfcr[l] & RQFCR_CLE) &&
+			!(ftp_rqfcr[l] & RQFCR_AND)) {
+			ftp_rqfcr[l] = RQFCR_CLE | RQFCR_CMP_EXACT |
+				RQFCR_HASHTBL_0 | RQFCR_PID_MASK;
+			ftp_rqfpr[l] = FPR_FILER_MASK;
+			gfar_write_filer(priv, l, ftp_rqfcr[l], ftp_rqfpr[l]);
+			break;
+		}
+
+		if (!(ftp_rqfcr[l] & RQFCR_CLE) && (ftp_rqfcr[l] & RQFCR_AND))
+			continue;
+		else {
+			local_rqfpr[j] = ftp_rqfpr[l];
+			local_rqfcr[j] = ftp_rqfcr[l];
+			j--;
+		}
+	}
+
+	priv->cur_filer_idx = l - 1;
+	last_rule_idx = l;
+
+	/* hash rules */
+	ethflow_to_filer_rules(priv, ethflow);
+
+	/* Write back the popped out rules again */
+	for (k = j+1; k < MAX_FILER_IDX; k++) {
+		ftp_rqfpr[priv->cur_filer_idx] = local_rqfpr[k];
+		ftp_rqfcr[priv->cur_filer_idx] = local_rqfcr[k];
+		gfar_write_filer(priv, priv->cur_filer_idx,
+				local_rqfcr[k], local_rqfpr[k]);
+		if (!priv->cur_filer_idx)
+			break;
+		priv->cur_filer_idx = priv->cur_filer_idx - 1;
+	}
+
+	return 1;
+}
+
+static int gfar_set_hash_opts(struct gfar_private *priv, struct ethtool_rxnfc *cmd)
+{
+	u64 class;
+
+	if (!gfar_ethflow_to_class(cmd->flow_type, &class))
+		return -EINVAL;
+
+	if (class < CLASS_CODE_USER_PROG1 ||
+			class > CLASS_CODE_SCTP_IPV6)
+		return -EINVAL;
+
+	/* write the filer rules here */
+	if (!gfar_ethflow_to_filer_table(priv, cmd->data, cmd->flow_type))
+		return -1;
+
+	return 0;
+}
+
+static int gfar_set_nfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
+{
+	struct gfar_private *priv = netdev_priv(dev);
+	int ret = 0;
+
+	switch(cmd->cmd) {
+	case ETHTOOL_SRXFH:
+		ret = gfar_set_hash_opts(priv, cmd);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
 const struct ethtool_ops gfar_ethtool_ops = {
 	.get_settings = gfar_gsettings,
 	.set_settings = gfar_ssettings,
@@ -670,4 +905,5 @@ const struct ethtool_ops gfar_ethtool_ops = {
 	.get_wol = gfar_get_wol,
 	.set_wol = gfar_set_wol,
 #endif
+	.set_rxnfc = gfar_set_nfc,
 };
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index eb1a48da2d43..edd03b79e8f8 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -674,6 +674,8 @@ struct ethtool_ops {
 #define	AH_V6_FLOW	0x0b
 #define	ESP_V6_FLOW	0x0c
 #define	IP_USER_FLOW	0x0d
+#define IPV4_FLOW       0x10
+#define IPV6_FLOW       0x11
 
 /* L3-L4 network traffic flow hash options */
 #define	RXH_L2DA	(1 << 1)
-- 
cgit v1.2.3


From fb0459d75c1d0a4ba3cafdd2c754e7486968a676 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 25 Sep 2009 12:25:56 +0200
Subject: perf/core: Provide a kernel-internal interface to get to performance
 counters

There are reasons for kernel code to ask for, and use, performance
counters.
For example, in CPU freq governors this tends to be a good idea, but
there are other examples possible as well of course.

This patch adds the needed bits to do enable this functionality; they
have been tested in an experimental cpufreq driver that I'm working on,
and the changes are all that I needed to access counters properly.

[fweisbec@gmail.com: added pid to perf_event_create_kernel_counter so
that we can profile a particular task too

TODO: Have a better error reporting, don't just return NULL in fail
case.]

v2: Remove the wrong comment about the fact
    perf_event_create_kernel_counter must be called from a kernel
    thread.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jan Kiszka <jan.kiszka@web.de>
Cc: Avi Kivity <avi@redhat.com>
LKML-Reference: <20090925122556.2f8bd939@infradead.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/perf_event.h |  6 ++++
 kernel/perf_event.c        | 75 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 80 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index df9d964c15fc..fa151d49a2ee 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -744,6 +744,12 @@ extern int hw_perf_group_sched_in(struct perf_event *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_event_context *ctx, int cpu);
 extern void perf_event_update_userpage(struct perf_event *event);
+extern int perf_event_release_kernel(struct perf_event *event);
+extern struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr,
+				int cpu,
+				pid_t pid);
+extern u64 perf_event_read_value(struct perf_event *event);
 
 struct perf_sample_data {
 	u64				type;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 12b5ec39bf97..02d4ff041b01 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1725,6 +1725,26 @@ static int perf_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+int perf_event_release_kernel(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_event_remove_from_context(event);
+	mutex_unlock(&ctx->mutex);
+
+	mutex_lock(&event->owner->perf_event_mutex);
+	list_del_init(&event->owner_entry);
+	mutex_unlock(&event->owner->perf_event_mutex);
+	put_task_struct(event->owner);
+
+	free_event(event);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
 static int perf_event_read_size(struct perf_event *event)
 {
 	int entry = sizeof(u64); /* value */
@@ -1750,7 +1770,7 @@ static int perf_event_read_size(struct perf_event *event)
 	return size;
 }
 
-static u64 perf_event_read_value(struct perf_event *event)
+u64 perf_event_read_value(struct perf_event *event)
 {
 	struct perf_event *child;
 	u64 total = 0;
@@ -1761,6 +1781,7 @@ static u64 perf_event_read_value(struct perf_event *event)
 
 	return total;
 }
+EXPORT_SYMBOL_GPL(perf_event_read_value);
 
 static int perf_event_read_entry(struct perf_event *event,
 				   u64 read_format, char __user *buf)
@@ -4638,6 +4659,58 @@ err_put_context:
 	return err;
 }
 
+/**
+ * perf_event_create_kernel_counter
+ *
+ * @attr: attributes of the counter to create
+ * @cpu: cpu in which the counter is bound
+ * @pid: task to profile
+ */
+struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+				 pid_t pid)
+{
+	struct perf_event *event;
+	struct perf_event_context *ctx;
+	int err;
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
+
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return NULL ;
+
+	event = perf_event_alloc(attr, cpu, ctx, NULL,
+				     NULL, GFP_KERNEL);
+	err = PTR_ERR(event);
+	if (IS_ERR(event))
+		goto err_put_context;
+
+	event->filp = NULL;
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_install_in_context(ctx, event, cpu);
+	++ctx->generation;
+	mutex_unlock(&ctx->mutex);
+
+	event->owner = current;
+	get_task_struct(current);
+	mutex_lock(&current->perf_event_mutex);
+	list_add_tail(&event->owner_entry, &current->perf_event_list);
+	mutex_unlock(&current->perf_event_mutex);
+
+	return event;
+
+err_put_context:
+	if (err < 0)
+		put_ctx(ctx);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+
 /*
  * inherit a event from parent task to child task:
  */
-- 
cgit v1.2.3


From 97eaf5300b9d0cd99c310bf8c4a0f2f3296d88a3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 18 Oct 2009 15:33:50 +0200
Subject: perf/core: Add a callback to perf events

A simple callback in a perf event can be used for multiple purposes.
For example it is useful for triggered based events like hardware
breakpoints that need a callback to dispatch a triggered breakpoint
event.

v2: Simplify a bit the callback attribution as suggested by Paul
    Mackerras

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mundt <lethal@linux-sh.org>
---
 include/linux/perf_event.h |  7 ++++++-
 kernel/perf_event.c        | 14 ++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fa151d49a2ee..8d54e6d25eeb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -544,6 +544,8 @@ struct perf_pending_entry {
 	void (*func)(struct perf_pending_entry *);
 };
 
+typedef void (*perf_callback_t)(struct perf_event *, void *);
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -639,6 +641,8 @@ struct perf_event {
 	struct event_filter		*filter;
 #endif
 
+	perf_callback_t			callback;
+
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -748,7 +752,8 @@ extern int perf_event_release_kernel(struct perf_event *event);
 extern struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				int cpu,
-				pid_t pid);
+				pid_t pid,
+				perf_callback_t callback);
 extern u64 perf_event_read_value(struct perf_event *event);
 
 struct perf_sample_data {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 02d4ff041b01..5087125e2a00 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4293,6 +4293,7 @@ perf_event_alloc(struct perf_event_attr *attr,
 		   struct perf_event_context *ctx,
 		   struct perf_event *group_leader,
 		   struct perf_event *parent_event,
+		   perf_callback_t callback,
 		   gfp_t gfpflags)
 {
 	const struct pmu *pmu;
@@ -4335,6 +4336,11 @@ perf_event_alloc(struct perf_event_attr *attr,
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (!callback && parent_event)
+		callback = parent_event->callback;
+	
+	event->callback	= callback;
+
 	if (attr->disabled)
 		event->state = PERF_EVENT_STATE_OFF;
 
@@ -4611,7 +4617,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-				     NULL, GFP_KERNEL);
+				     NULL, NULL, GFP_KERNEL);
 	err = PTR_ERR(event);
 	if (IS_ERR(event))
 		goto err_put_context;
@@ -4668,7 +4674,7 @@ err_put_context:
  */
 struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-				 pid_t pid)
+				 pid_t pid, perf_callback_t callback)
 {
 	struct perf_event *event;
 	struct perf_event_context *ctx;
@@ -4683,7 +4689,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 		return NULL ;
 
 	event = perf_event_alloc(attr, cpu, ctx, NULL,
-				     NULL, GFP_KERNEL);
+				     NULL, callback, GFP_KERNEL);
 	err = PTR_ERR(event);
 	if (IS_ERR(event))
 		goto err_put_context;
@@ -4736,7 +4742,7 @@ inherit_event(struct perf_event *parent_event,
 	child_event = perf_event_alloc(&parent_event->attr,
 					   parent_event->cpu, child_ctx,
 					   group_leader, parent_event,
-					   GFP_KERNEL);
+					   NULL, GFP_KERNEL);
 	if (IS_ERR(child_event))
 		return child_event;
 	get_ctx(child_ctx);
-- 
cgit v1.2.3


From 1477b6a7edd9ffa7bba4f9779ce9a76ce92761ed Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 4 Nov 2009 16:14:16 +0900
Subject: sched: Remove unused __schedule() declaration

__schedule() had been removed.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AF129C8.3030008@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 -
 kernel/kgdb.c         | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60bf583..f18102c4d0b8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -349,7 +349,6 @@ extern signed long schedule_timeout(signed long timeout);
 extern signed long schedule_timeout_interruptible(signed long timeout);
 extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
-asmlinkage void __schedule(void);
 asmlinkage void schedule(void);
 extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
 
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..7d7014634022 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
 
 	/*
 	 * All threads that don't have debuggerinfo should be
-	 * in __schedule() sleeping, since all other CPUs
+	 * in schedule() sleeping, since all other CPUs
 	 * are in kgdb_wait, and thus have debuggerinfo.
 	 */
 	if (local_debuggerinfo) {
-- 
cgit v1.2.3


From 2a2bb3142d326bb28b03875cabfc49baaac9a14a Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 4 Nov 2009 16:16:10 +0900
Subject: sched: Remove unused time_sync_thresh declaration

time_sync_thresh had been removed.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AF12A3A.5050200@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f18102c4d0b8..754b3deed02b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -171,8 +171,6 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 }
 #endif
 
-extern unsigned long long time_sync_thresh;
-
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
-- 
cgit v1.2.3


From 9824a2b728b63e7ff586b9fd9293c819be79f0f3 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 4 Nov 2009 16:16:54 +0900
Subject: sched: Remove unused cpu_nr_migrations()

cpu_nr_migrations() is not used, remove it.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AF12A66.6020609@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 -
 kernel/sched.c        | 11 -----------
 2 files changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 754b3deed02b..dfc21fb76bf1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -145,7 +145,6 @@ extern unsigned long this_cpu_load(void);
 
 
 extern void calc_global_load(void);
-extern u64 cpu_nr_migrations(int cpu);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 67be4d0dddaa..30fd0ba5f603 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -541,7 +541,6 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
-	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -2049,7 +2048,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
-		new_rq->nr_migrations_in++;
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
@@ -2988,15 +2986,6 @@ static void calc_load_account_active(struct rq *this_rq)
 	}
 }
 
-/*
- * Externally visible per-cpu scheduler statistics:
- * cpu_nr_migrations(cpu) - number of migrations into that cpu
- */
-u64 cpu_nr_migrations(int cpu)
-{
-	return cpu_rq(cpu)->nr_migrations_in;
-}
-
 /*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
-- 
cgit v1.2.3


From acc3f5d7cabbfd6cec71f0c1f9900621fa2d6ae7 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 3 Nov 2009 14:53:40 +1030
Subject: cpumask: Partition_sched_domains takes array of cpumask_var_t

Currently partition_sched_domains() takes a 'struct cpumask
*doms_new' which is a kmalloc'ed array of cpumask_t.  You can't
have such an array if 'struct cpumask' is undefined, as we plan
for CONFIG_CPUMASK_OFFSTACK=y.

So, we make this an array of cpumask_var_t instead: this is the
same for the CONFIG_CPUMASK_OFFSTACK=n case, but requires
multiple allocations for the CONFIG_CPUMASK_OFFSTACK=y case.
Hence we add alloc_sched_domains() and free_sched_domains()
functions.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <200911031453.40668.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  8 ++++--
 kernel/cpuset.c       | 19 +++++++-------
 kernel/sched.c        | 68 ++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 61 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dfc21fb76bf1..78ba664474f3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1009,9 +1009,13 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }
 
-extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);
 
+/* Allocate an array of sched domains, for partition_sched_domains(). */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+
 /* Test a flag in parent sched domain */
 static inline int test_sd_parent(struct sched_domain *sd, int flag)
 {
@@ -1029,7 +1033,7 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
 struct sched_domain_attr;
 
 static inline void
-partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			struct sched_domain_attr *dattr_new)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d247381e7371..3cf2183b472d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  *	element of the partition (one sched domain) to be passed to
  *	partition_sched_domains().
  */
-/* FIXME: see the FIXME in partition_sched_domains() */
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
 			struct sched_domain_attr **attributes)
 {
 	LIST_HEAD(q);		/* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
-	struct cpumask *doms;	/* resulting partition; i.e. sched domains */
+	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
 
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
-		doms = kmalloc(cpumask_size(), GFP_KERNEL);
+		ndoms = 1;
+		doms = alloc_sched_domains(ndoms);
 		if (!doms)
 			goto done;
 
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		cpumask_copy(doms, top_cpuset.cpus_allowed);
+		cpumask_copy(doms[0], top_cpuset.cpus_allowed);
 
-		ndoms = 1;
 		goto done;
 	}
 
@@ -636,7 +635,7 @@ restart:
 	 * Now we know how many domains to create.
 	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
 	 */
-	doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
+	doms = alloc_sched_domains(ndoms);
 	if (!doms)
 		goto done;
 
@@ -656,7 +655,7 @@ restart:
 			continue;
 		}
 
-		dp = doms + nslot;
+		dp = doms[nslot];
 
 		if (nslot == ndoms) {
 			static int warnings = 10;
@@ -718,7 +717,7 @@ done:
 static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	get_online_cpus();
@@ -2052,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	switch (phase) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 30fd0ba5f603..ae026aad145b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8846,7 +8846,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
 	return __build_sched_domains(cpu_map, NULL);
 }
 
-static struct cpumask *doms_cur;	/* current sched domains */
+static cpumask_var_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 				/* attribues of custom domains in 'doms_cur' */
@@ -8868,6 +8868,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
 	return 0;
 }
 
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -8879,12 +8904,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
 
 	arch_update_cpu_topology();
 	ndoms_cur = 1;
-	doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
+	doms_cur = alloc_sched_domains(ndoms_cur);
 	if (!doms_cur)
-		doms_cur = fallback_doms;
-	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
 	dattr_cur = NULL;
-	err = build_sched_domains(doms_cur);
+	err = build_sched_domains(doms_cur[0]);
 	register_sched_domain_sysctl();
 
 	return err;
@@ -8934,19 +8959,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
- * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
  * current 'doms_cur' domains and in the new 'doms_new', we can leave
  * it as it is.
  *
- * The passed in 'doms_new' should be kmalloc'd. This routine takes
- * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL &&
- * ndoms_new == 1, and partition_sched_domains() will fallback to
- * the single partition 'fallback_doms', it also forces the domains
- * to be rebuilt.
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8954,8 +8979,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  *
  * Call with hotplug lock held
  */
-/* FIXME: Change to struct cpumask *doms_new[] */
-void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
@@ -8974,40 +8998,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(&doms_cur[i], &doms_new[j])
+			if (cpumask_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* no match - a current sched domain not in new doms_new[] */
-		detach_destroy_domains(doms_cur + i);
+		detach_destroy_domains(doms_cur[i]);
 match1:
 		;
 	}
 
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		doms_new = fallback_doms;
-		cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur && !new_topology; j++) {
-			if (cpumask_equal(&doms_new[i], &doms_cur[j])
+			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* no match - add a new doms_new */
-		__build_sched_domains(doms_new + i,
+		__build_sched_domains(doms_new[i],
 					dattr_new ? dattr_new + i : NULL);
 match2:
 		;
 	}
 
 	/* Remember the new sched domains */
-	if (doms_cur != fallback_doms)
-		kfree(doms_cur);
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
 	kfree(dattr_cur);	/* kfree(NULL) is safe */
 	doms_cur = doms_new;
 	dattr_cur = dattr_new;
-- 
cgit v1.2.3


From 663e69592856df53ef52969482ef413a96bc4e06 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 4 Nov 2009 14:22:21 +0100
Subject: irq: Remove unused debug_poll_all_shared_irqs()

commit 74296a8ed added this function for debug purposes, but it was
never used for anything. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |  6 ------
 kernel/irq/spurious.c     | 14 +-------------
 2 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7ca72b74eec7..75f3f00ac1e5 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -603,12 +603,6 @@ static inline void init_irq_proc(void)
 }
 #endif
 
-#if defined(CONFIG_GENERIC_HARDIRQS) && defined(CONFIG_DEBUG_SHIRQ)
-extern void debug_poll_all_shared_irqs(void);
-#else
-static inline void debug_poll_all_shared_irqs(void) { }
-#endif
-
 struct seq_file;
 int show_interrupts(struct seq_file *p, void *v);
 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760fe..8996b98f9eb2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
 	return ok;
 }
 
-static void poll_all_shared_irqs(void)
+static void poll_spurious_irqs(unsigned long dummy)
 {
 	struct irq_desc *desc;
 	int i;
@@ -123,23 +123,11 @@ static void poll_all_shared_irqs(void)
 
 		try_one_irq(i, desc);
 	}
-}
-
-static void poll_spurious_irqs(unsigned long dummy)
-{
-	poll_all_shared_irqs();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
 
-#ifdef CONFIG_DEBUG_SHIRQ
-void debug_poll_all_shared_irqs(void)
-{
-	poll_all_shared_irqs();
-}
-#endif
-
 /*
  * If 99,900 of the previous 100,000 interrupts have not been handled
  * then assume that the IRQ is stuck in some manner. Drop a diagnostic
-- 
cgit v1.2.3


From c6d14c84566d6b70ad9dc1618db0dec87cca9300 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 4 Nov 2009 05:43:23 -0800
Subject: net: Introduce for_each_netdev_rcu() iterator

Adds RCU management to the list of netdevices.

Convert some for_each_netdev() users to RCU version, if
it can avoid read_lock-ing dev_base_lock

Ie:
	read_lock(&dev_base_loack);
	for_each_netdev(net, dev)
		some_action();
	read_unlock(&dev_base_lock);

becomes :

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		some_action();
	rcu_read_unlock();


Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 30 ++++++++++++++++--------------
 net/decnet/af_decnet.c    |  6 +++---
 net/decnet/dn_fib.c       |  6 +++---
 net/decnet/dn_route.c     |  6 +++---
 net/ipv4/devinet.c        | 30 +++++++++++-------------------
 net/ipv6/addrconf.c       | 20 +++++++-------------
 net/ipv6/anycast.c        |  6 +++---
 net/netrom/nr_route.c     | 15 ++++++++-------
 net/rose/rose_route.c     | 18 +++++++++---------
 net/sctp/protocol.c       |  6 +++---
 11 files changed, 68 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bcf1083857fc..5077de028317 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1081,6 +1081,8 @@ extern rwlock_t				dev_base_lock;		/* Device list lock */
 
 #define for_each_netdev(net, d)		\
 		list_for_each_entry(d, &(net)->dev_base_head, dev_list)
+#define for_each_netdev_rcu(net, d)		\
+		list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_safe(net, d, n)	\
 		list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_continue(net, d)		\
diff --git a/net/core/dev.c b/net/core/dev.c
index 76a1502efe67..bf629ac08b87 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -175,7 +175,7 @@ static struct list_head ptype_all __read_mostly;	/* Taps */
  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  * semaphore.
  *
- * Pure readers hold dev_base_lock for reading.
+ * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  *
  * Writers must hold the rtnl semaphore while they loop through the
  * dev_base_head list, and hold dev_base_lock for writing when they do the
@@ -212,7 +212,7 @@ static int list_netdevice(struct net_device *dev)
 	ASSERT_RTNL();
 
 	write_lock_bh(&dev_base_lock);
-	list_add_tail(&dev->dev_list, &net->dev_base_head);
+	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 	hlist_add_head_rcu(&dev->index_hlist,
 			   dev_index_hash(net, dev->ifindex));
@@ -229,7 +229,7 @@ static void unlist_netdevice(struct net_device *dev)
 
 	/* Unlink dev from the device chain */
 	write_lock_bh(&dev_base_lock);
-	list_del(&dev->dev_list);
+	list_del_rcu(&dev->dev_list);
 	hlist_del_rcu(&dev->name_hlist);
 	hlist_del_rcu(&dev->index_hlist);
 	write_unlock_bh(&dev_base_lock);
@@ -799,15 +799,15 @@ struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 	struct net_device *dev, *ret;
 
 	ret = NULL;
-	read_lock(&dev_base_lock);
-	for_each_netdev(net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
 		if (((dev->flags ^ if_flags) & mask) == 0) {
 			dev_hold(dev);
 			ret = dev;
 			break;
 		}
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return ret;
 }
 EXPORT_SYMBOL(dev_get_by_flags);
@@ -3077,18 +3077,18 @@ static int dev_ifconf(struct net *net, char __user *arg)
  *	in detail.
  */
 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(dev_base_lock)
+	__acquires(RCU)
 {
 	struct net *net = seq_file_net(seq);
 	loff_t off;
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
+	rcu_read_lock();
 	if (!*pos)
 		return SEQ_START_TOKEN;
 
 	off = 1;
-	for_each_netdev(net, dev)
+	for_each_netdev_rcu(net, dev)
 		if (off++ == *pos)
 			return dev;
 
@@ -3097,16 +3097,18 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos)
 
 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct net *net = seq_file_net(seq);
+	struct net_device *dev = (v == SEQ_START_TOKEN) ?
+				  first_net_device(seq_file_net(seq)) :
+				  next_net_device((struct net_device *)v);
+
 	++*pos;
-	return v == SEQ_START_TOKEN ?
-		first_net_device(net) : next_net_device((struct net_device *)v);
+	return rcu_dereference(dev);
 }
 
 void dev_seq_stop(struct seq_file *seq, void *v)
-	__releases(dev_base_lock)
+	__releases(RCU)
 {
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 664965c87e16..2e355841ca99 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -749,9 +749,9 @@ static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	if (!(saddr->sdn_flags & SDF_WILD)) {
 		if (le16_to_cpu(saddr->sdn_nodeaddrl)) {
-			read_lock(&dev_base_lock);
+			rcu_read_lock();
 			ldev = NULL;
-			for_each_netdev(&init_net, dev) {
+			for_each_netdev_rcu(&init_net, dev) {
 				if (!dev->dn_ptr)
 					continue;
 				if (dn_dev_islocal(dev, dn_saddr2dn(saddr))) {
@@ -759,7 +759,7 @@ static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 					break;
 				}
 			}
-			read_unlock(&dev_base_lock);
+			rcu_read_unlock();
 			if (ldev == NULL)
 				return -EADDRNOTAVAIL;
 		}
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 27ea2e9b080a..fd641f65e092 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -607,8 +607,8 @@ static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa)
 	ASSERT_RTNL();
 
 	/* Scan device list */
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
 		dn_db = dev->dn_ptr;
 		if (dn_db == NULL)
 			continue;
@@ -619,7 +619,7 @@ static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa)
 			}
 		}
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 
 	if (found_it == 0) {
 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa);
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 57662cabaf9b..860286a3921b 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -908,8 +908,8 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *old
 			dev_put(dev_out);
 			goto out;
 		}
-		read_lock(&dev_base_lock);
-		for_each_netdev(&init_net, dev) {
+		rcu_read_lock();
+		for_each_netdev_rcu(&init_net, dev) {
 			if (!dev->dn_ptr)
 				continue;
 			if (!dn_dev_islocal(dev, oldflp->fld_src))
@@ -922,7 +922,7 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *old
 			dev_out = dev;
 			break;
 		}
-		read_unlock(&dev_base_lock);
+		rcu_read_unlock();
 		if (dev_out == NULL)
 			goto out;
 		dev_hold(dev_out);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ccccaae50b20..8aa7a134c1f1 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -876,19 +876,16 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
 		if (!addr)
 			addr = ifa->ifa_local;
 	} endfor_ifa(in_dev);
-no_in_dev:
-	rcu_read_unlock();
 
+no_in_dev:
 	if (addr)
-		goto out;
+		goto out_unlock;
 
 	/* Not loopback addresses on loopback should be preferred
 	   in this case. It is importnat that lo is the first interface
 	   in dev_base list.
 	 */
-	read_lock(&dev_base_lock);
-	rcu_read_lock();
-	for_each_netdev(net, dev) {
+	for_each_netdev_rcu(net, dev) {
 		if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
 			continue;
 
@@ -896,12 +893,11 @@ no_in_dev:
 			if (ifa->ifa_scope != RT_SCOPE_LINK &&
 			    ifa->ifa_scope <= scope) {
 				addr = ifa->ifa_local;
-				goto out_unlock_both;
+				goto out_unlock;
 			}
 		} endfor_ifa(in_dev);
 	}
-out_unlock_both:
-	read_unlock(&dev_base_lock);
+out_unlock:
 	rcu_read_unlock();
 out:
 	return addr;
@@ -962,9 +958,8 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
 		return confirm_addr_indev(in_dev, dst, local, scope);
 
 	net = dev_net(in_dev->dev);
-	read_lock(&dev_base_lock);
 	rcu_read_lock();
-	for_each_netdev(net, dev) {
+	for_each_netdev_rcu(net, dev) {
 		if ((in_dev = __in_dev_get_rcu(dev))) {
 			addr = confirm_addr_indev(in_dev, dst, local, scope);
 			if (addr)
@@ -972,7 +967,6 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
 		}
 	}
 	rcu_read_unlock();
-	read_unlock(&dev_base_lock);
 
 	return addr;
 }
@@ -1240,18 +1234,18 @@ static void devinet_copy_dflt_conf(struct net *net, int i)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
 		struct in_device *in_dev;
-		rcu_read_lock();
+
 		in_dev = __in_dev_get_rcu(dev);
 		if (in_dev && !test_bit(i, in_dev->cnf.state))
 			in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
-		rcu_read_unlock();
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
+/* called with RTNL locked */
 static void inet_forward_change(struct net *net)
 {
 	struct net_device *dev;
@@ -1260,7 +1254,6 @@ static void inet_forward_change(struct net *net)
 	IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
 	IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
 
-	read_lock(&dev_base_lock);
 	for_each_netdev(net, dev) {
 		struct in_device *in_dev;
 		if (on)
@@ -1271,7 +1264,6 @@ static void inet_forward_change(struct net *net)
 			IN_DEV_CONF_SET(in_dev, FORWARDING, on);
 		rcu_read_unlock();
 	}
-	read_unlock(&dev_base_lock);
 }
 
 static int devinet_conf_proc(ctl_table *ctl, int write,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 918648409612..024bba30de21 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -481,9 +481,8 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
 	struct net_device *dev;
 	struct inet6_dev *idev;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(net, dev) {
-		rcu_read_lock();
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
 		idev = __in6_dev_get(dev);
 		if (idev) {
 			int changed = (!idev->cnf.forwarding) ^ (!newf);
@@ -491,9 +490,8 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
 			if (changed)
 				dev_forward_change(idev);
 		}
-		rcu_read_unlock();
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
@@ -1137,10 +1135,9 @@ int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev,
 	hiscore->rule = -1;
 	hiscore->ifa = NULL;
 
-	read_lock(&dev_base_lock);
 	rcu_read_lock();
 
-	for_each_netdev(net, dev) {
+	for_each_netdev_rcu(net, dev) {
 		struct inet6_dev *idev;
 
 		/* Candidate Source Address (section 4)
@@ -1235,7 +1232,6 @@ try_nextdev:
 		read_unlock_bh(&idev->lock);
 	}
 	rcu_read_unlock();
-	read_unlock(&dev_base_lock);
 
 	if (!hiscore->ifa)
 		return -EADDRNOTAVAIL;
@@ -4052,9 +4048,8 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
 	struct net_device *dev;
 	struct inet6_dev *idev;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(net, dev) {
-		rcu_read_lock();
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
 		idev = __in6_dev_get(dev);
 		if (idev) {
 			int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
@@ -4062,9 +4057,8 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
 			if (changed)
 				dev_disable_change(idev);
 		}
-		rcu_read_unlock();
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old)
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 1ae58bec1de0..2f00ca83f049 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -404,13 +404,13 @@ int ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
 
 	if (dev)
 		return ipv6_chk_acast_dev(dev, addr);
-	read_lock(&dev_base_lock);
-	for_each_netdev(net, dev)
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev)
 		if (ipv6_chk_acast_dev(dev, addr)) {
 			found = 1;
 			break;
 		}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return found;
 }
 
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
index 4eb1ac9a7679..aacba76070fc 100644
--- a/net/netrom/nr_route.c
+++ b/net/netrom/nr_route.c
@@ -597,15 +597,15 @@ struct net_device *nr_dev_first(void)
 {
 	struct net_device *dev, *first = NULL;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
 		if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM)
 			if (first == NULL || strncmp(dev->name, first->name, 3) < 0)
 				first = dev;
 	}
 	if (first)
 		dev_hold(first);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 
 	return first;
 }
@@ -617,16 +617,17 @@ struct net_device *nr_dev_get(ax25_address *addr)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, dev) {
-		if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM &&
+		    ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) {
 			dev_hold(dev);
 			goto out;
 		}
 	}
 	dev = NULL;
 out:
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return dev;
 }
 
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 9478d9b3d977..d936226916f2 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -600,13 +600,13 @@ struct net_device *rose_dev_first(void)
 {
 	struct net_device *dev, *first = NULL;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
 		if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE)
 			if (first == NULL || strncmp(dev->name, first->name, 3) < 0)
 				first = dev;
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 
 	return first;
 }
@@ -618,8 +618,8 @@ struct net_device *rose_dev_get(rose_address *addr)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
 		if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) {
 			dev_hold(dev);
 			goto out;
@@ -627,7 +627,7 @@ struct net_device *rose_dev_get(rose_address *addr)
 	}
 	dev = NULL;
 out:
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return dev;
 }
 
@@ -635,14 +635,14 @@ static int rose_dev_exists(rose_address *addr)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
 		if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0)
 			goto out;
 	}
 	dev = NULL;
 out:
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return dev != NULL;
 }
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d9f4cc2c7869..fe44c57101de 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -205,14 +205,14 @@ static void sctp_get_local_addr_list(void)
 	struct list_head *pos;
 	struct sctp_af *af;
 
-	read_lock(&dev_base_lock);
-	for_each_netdev(&init_net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
 		__list_for_each(pos, &sctp_address_families) {
 			af = list_entry(pos, struct sctp_af, list);
 			af->copy_addrlist(&sctp_local_addr_list, dev);
 		}
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 /* Free the existing local addresses.  */
-- 
cgit v1.2.3


From 89e1838f5f2c2af80268a096b9a687643b0d0846 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 21 Sep 2009 10:46:22 +0200
Subject: change default: by default, use socket buffer auto tuning

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 include/linux/drbd_limits.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 9d067ce46960..51f47a586ad8 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -70,11 +70,11 @@
   /* I don't think that a tcp send buffer of more than 10M is usefull */
 #define DRBD_SNDBUF_SIZE_MIN  0
 #define DRBD_SNDBUF_SIZE_MAX  (10<<20)
-#define DRBD_SNDBUF_SIZE_DEF  (2*65535)
+#define DRBD_SNDBUF_SIZE_DEF  0
 
 #define DRBD_RCVBUF_SIZE_MIN  0
 #define DRBD_RCVBUF_SIZE_MAX  (10<<20)
-#define DRBD_RCVBUF_SIZE_DEF  (2*65535)
+#define DRBD_RCVBUF_SIZE_DEF  0
 
   /* @4k PageSize -> 128kB - 512MB */
 #define DRBD_MAX_BUFFERS_MIN  32
-- 
cgit v1.2.3


From ed814525f2e45188964c270fc3a5a0b644f7e4a9 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 27 Oct 2009 12:37:14 +0100
Subject: Now it is equal to DRBD release 8.3.5 without compat crap

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 include/linux/drbd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 233db5c18b86..18942ad115d9 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,7 +53,7 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.3rc2"
+#define REL_VERSION "8.3.5"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 91
-- 
cgit v1.2.3


From d4ac42a582e46d7f86f0acb4253a310423c72c4c Mon Sep 17 00:00:00 2001
From: Kristoffer Glembo <kristoffer@gaisler.com>
Date: Wed, 4 Nov 2009 08:39:46 -0800
Subject: sparc: Support for GRLIB APBUART serial port

This patch adds support for the APBUART serial port from Aeroflex
Gaisler's IP library GRLIB. It is currently used in all LEON3 designs
(SPARC V8) but can be used on other platforms as well (which support OF).

Signed-off-by: Kristoffer Glembo <kristoffer@gaisler.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/serial/Kconfig      |  13 +
 drivers/serial/Makefile     |   1 +
 drivers/serial/apbuart.c    | 710 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/serial/apbuart.h    |  64 ++++
 include/linux/serial_core.h |   3 +
 5 files changed, 791 insertions(+)
 create mode 100644 drivers/serial/apbuart.c
 create mode 100644 drivers/serial/apbuart.h

(limited to 'include/linux')

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index e52257257279..50943ff78f4b 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -1477,4 +1477,17 @@ config SERIAL_BCM63XX_CONSOLE
 	  If you have enabled the serial port on the bcm63xx CPU
 	  you can make it the console by answering Y to this option.
 
+config SERIAL_GRLIB_GAISLER_APBUART
+	tristate "GRLIB APBUART serial support"
+	depends on OF
+	---help---
+	Add support for the GRLIB APBUART serial port.
+
+config SERIAL_GRLIB_GAISLER_APBUART_CONSOLE
+	bool "Console on GRLIB APBUART serial port"
+	depends on SERIAL_GRLIB_GAISLER_APBUART=y
+	select SERIAL_CORE_CONSOLE
+	help
+	Support for running a console on the GRLIB APBUART
+
 endmenu
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index d21d5dd5d048..5548fe7df61d 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -81,3 +81,4 @@ obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o
 obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o
 obj-$(CONFIG_SERIAL_QE) += ucc_uart.o
 obj-$(CONFIG_SERIAL_TIMBERDALE)	+= timbuart.o
+obj-$(CONFIG_SERIAL_GRLIB_GAISLER_APBUART) += apbuart.o
diff --git a/drivers/serial/apbuart.c b/drivers/serial/apbuart.c
new file mode 100644
index 000000000000..8a343045131f
--- /dev/null
+++ b/drivers/serial/apbuart.c
@@ -0,0 +1,710 @@
+/*
+ *  Driver for GRLIB serial ports (APBUART)
+ *
+ *  Based on linux/drivers/serial/amba.c
+ *
+ *  Copyright (C) 2000 Deep Blue Solutions Ltd.
+ *  Copyright (C) 2003 Konrad Eisele <eiselekd@web.de>
+ *  Copyright (C) 2006 Daniel Hellstrom <daniel@gaisler.com>, Aeroflex Gaisler AB
+ *  Copyright (C) 2008 Gilead Kutnick <kutnickg@zin-tech.com>
+ *  Copyright (C) 2009 Kristoffer Glembo <kristoffer@gaisler.com>, Aeroflex Gaisler AB
+ */
+
+#if defined(CONFIG_SERIAL_GRLIB_GAISLER_APBUART_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/module.h>
+#include <linux/tty.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/serial.h>
+#include <linux/console.h>
+#include <linux/sysrq.h>
+#include <linux/kthread.h>
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+#include <linux/serial_core.h>
+#include <asm/irq.h>
+#include <asm/oplib.h>
+
+#include "apbuart.h"
+
+#define SERIAL_APBUART_MAJOR	TTY_MAJOR
+#define SERIAL_APBUART_MINOR	64
+#define UART_DUMMY_RSR_RX	0x8000	/* for ignore all read */
+
+static void apbuart_tx_chars(struct uart_port *port);
+
+static void apbuart_stop_tx(struct uart_port *port)
+{
+	unsigned int cr;
+
+	cr = UART_GET_CTRL(port);
+	cr &= ~UART_CTRL_TI;
+	UART_PUT_CTRL(port, cr);
+}
+
+static void apbuart_start_tx(struct uart_port *port)
+{
+	unsigned int cr;
+
+	cr = UART_GET_CTRL(port);
+	cr |= UART_CTRL_TI;
+	UART_PUT_CTRL(port, cr);
+
+	if (UART_GET_STATUS(port) & UART_STATUS_THE)
+		apbuart_tx_chars(port);
+}
+
+static void apbuart_stop_rx(struct uart_port *port)
+{
+	unsigned int cr;
+
+	cr = UART_GET_CTRL(port);
+	cr &= ~(UART_CTRL_RI);
+	UART_PUT_CTRL(port, cr);
+}
+
+static void apbuart_enable_ms(struct uart_port *port)
+{
+	/* No modem status change interrupts for APBUART */
+}
+
+static void apbuart_rx_chars(struct uart_port *port)
+{
+	struct tty_struct *tty = port->state->port.tty;
+	unsigned int status, ch, rsr, flag;
+	unsigned int max_chars = port->fifosize;
+
+	status = UART_GET_STATUS(port);
+
+	while (UART_RX_DATA(status) && (max_chars--)) {
+
+		ch = UART_GET_CHAR(port);
+		flag = TTY_NORMAL;
+
+		port->icount.rx++;
+
+		rsr = UART_GET_STATUS(port) | UART_DUMMY_RSR_RX;
+		UART_PUT_STATUS(port, 0);
+		if (rsr & UART_STATUS_ERR) {
+
+			if (rsr & UART_STATUS_BR) {
+				rsr &= ~(UART_STATUS_FE | UART_STATUS_PE);
+				port->icount.brk++;
+				if (uart_handle_break(port))
+					goto ignore_char;
+			} else if (rsr & UART_STATUS_PE) {
+				port->icount.parity++;
+			} else if (rsr & UART_STATUS_FE) {
+				port->icount.frame++;
+			}
+			if (rsr & UART_STATUS_OE)
+				port->icount.overrun++;
+
+			rsr &= port->read_status_mask;
+
+			if (rsr & UART_STATUS_PE)
+				flag = TTY_PARITY;
+			else if (rsr & UART_STATUS_FE)
+				flag = TTY_FRAME;
+		}
+
+		if (uart_handle_sysrq_char(port, ch))
+			goto ignore_char;
+
+		uart_insert_char(port, rsr, UART_STATUS_OE, ch, flag);
+
+
+	      ignore_char:
+		status = UART_GET_STATUS(port);
+	}
+
+	tty_flip_buffer_push(tty);
+}
+
+static void apbuart_tx_chars(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->state->xmit;
+	int count;
+
+	if (port->x_char) {
+		UART_PUT_CHAR(port, port->x_char);
+		port->icount.tx++;
+		port->x_char = 0;
+		return;
+	}
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+		apbuart_stop_tx(port);
+		return;
+	}
+
+	/* amba: fill FIFO */
+	count = port->fifosize >> 1;
+	do {
+		UART_PUT_CHAR(port, xmit->buf[xmit->tail]);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+		if (uart_circ_empty(xmit))
+			break;
+	} while (--count > 0);
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	if (uart_circ_empty(xmit))
+		apbuart_stop_tx(port);
+}
+
+static irqreturn_t apbuart_int(int irq, void *dev_id)
+{
+	struct uart_port *port = dev_id;
+	unsigned int status;
+
+	spin_lock(&port->lock);
+
+	status = UART_GET_STATUS(port);
+	if (status & UART_STATUS_DR)
+		apbuart_rx_chars(port);
+	if (status & UART_STATUS_THE)
+		apbuart_tx_chars(port);
+
+	spin_unlock(&port->lock);
+
+	return IRQ_HANDLED;
+}
+
+static unsigned int apbuart_tx_empty(struct uart_port *port)
+{
+	unsigned int status = UART_GET_STATUS(port);
+	return status & UART_STATUS_THE ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int apbuart_get_mctrl(struct uart_port *port)
+{
+	/* The GRLIB APBUART handles flow control in hardware */
+	return TIOCM_CAR | TIOCM_DSR | TIOCM_CTS;
+}
+
+static void apbuart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	/* The GRLIB APBUART handles flow control in hardware */
+}
+
+static void apbuart_break_ctl(struct uart_port *port, int break_state)
+{
+	/* We don't support sending break */
+}
+
+static int apbuart_startup(struct uart_port *port)
+{
+	int retval;
+	unsigned int cr;
+
+	/* Allocate the IRQ */
+	retval = request_irq(port->irq, apbuart_int, 0, "apbuart", port);
+	if (retval)
+		return retval;
+
+	/* Finally, enable interrupts */
+	cr = UART_GET_CTRL(port);
+	UART_PUT_CTRL(port,
+		      cr | UART_CTRL_RE | UART_CTRL_TE |
+		      UART_CTRL_RI | UART_CTRL_TI);
+
+	return 0;
+}
+
+static void apbuart_shutdown(struct uart_port *port)
+{
+	unsigned int cr;
+
+	/* disable all interrupts, disable the port */
+	cr = UART_GET_CTRL(port);
+	UART_PUT_CTRL(port,
+		      cr & ~(UART_CTRL_RE | UART_CTRL_TE |
+			     UART_CTRL_RI | UART_CTRL_TI));
+
+	/* Free the interrupt */
+	free_irq(port->irq, port);
+}
+
+static void apbuart_set_termios(struct uart_port *port,
+				struct ktermios *termios, struct ktermios *old)
+{
+	unsigned int cr;
+	unsigned long flags;
+	unsigned int baud, quot;
+
+	/* Ask the core to calculate the divisor for us. */
+	baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16);
+	if (baud == 0)
+		panic("invalid baudrate %i\n", port->uartclk / 16);
+
+	/* uart_get_divisor calc a *16 uart freq, apbuart is *8 */
+	quot = (uart_get_divisor(port, baud)) * 2;
+	cr = UART_GET_CTRL(port);
+	cr &= ~(UART_CTRL_PE | UART_CTRL_PS);
+
+	if (termios->c_cflag & PARENB) {
+		cr |= UART_CTRL_PE;
+		if ((termios->c_cflag & PARODD))
+			cr |= UART_CTRL_PS;
+	}
+
+	/* Enable flow control. */
+	if (termios->c_cflag & CRTSCTS)
+		cr |= UART_CTRL_FL;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	/* Update the per-port timeout. */
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	port->read_status_mask = UART_STATUS_OE;
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |= UART_STATUS_FE | UART_STATUS_PE;
+
+	/* Characters to ignore */
+	port->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		port->ignore_status_mask |= UART_STATUS_FE | UART_STATUS_PE;
+
+	/* Ignore all characters if CREAD is not set. */
+	if ((termios->c_cflag & CREAD) == 0)
+		port->ignore_status_mask |= UART_DUMMY_RSR_RX;
+
+	/* Set baud rate */
+	quot -= 1;
+	UART_PUT_SCAL(port, quot);
+	UART_PUT_CTRL(port, cr);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static const char *apbuart_type(struct uart_port *port)
+{
+	return port->type == PORT_APBUART ? "GRLIB/APBUART" : NULL;
+}
+
+static void apbuart_release_port(struct uart_port *port)
+{
+	release_mem_region(port->mapbase, 0x100);
+}
+
+static int apbuart_request_port(struct uart_port *port)
+{
+	return request_mem_region(port->mapbase, 0x100, "grlib-apbuart")
+	    != NULL ? 0 : -EBUSY;
+	return 0;
+}
+
+/* Configure/autoconfigure the port */
+static void apbuart_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE) {
+		port->type = PORT_APBUART;
+		apbuart_request_port(port);
+	}
+}
+
+/* Verify the new serial_struct (for TIOCSSERIAL) */
+static int apbuart_verify_port(struct uart_port *port,
+			       struct serial_struct *ser)
+{
+	int ret = 0;
+	if (ser->type != PORT_UNKNOWN && ser->type != PORT_APBUART)
+		ret = -EINVAL;
+	if (ser->irq < 0 || ser->irq >= NR_IRQS)
+		ret = -EINVAL;
+	if (ser->baud_base < 9600)
+		ret = -EINVAL;
+	return ret;
+}
+
+static struct uart_ops grlib_apbuart_ops = {
+	.tx_empty = apbuart_tx_empty,
+	.set_mctrl = apbuart_set_mctrl,
+	.get_mctrl = apbuart_get_mctrl,
+	.stop_tx = apbuart_stop_tx,
+	.start_tx = apbuart_start_tx,
+	.stop_rx = apbuart_stop_rx,
+	.enable_ms = apbuart_enable_ms,
+	.break_ctl = apbuart_break_ctl,
+	.startup = apbuart_startup,
+	.shutdown = apbuart_shutdown,
+	.set_termios = apbuart_set_termios,
+	.type = apbuart_type,
+	.release_port = apbuart_release_port,
+	.request_port = apbuart_request_port,
+	.config_port = apbuart_config_port,
+	.verify_port = apbuart_verify_port,
+};
+
+static struct uart_port grlib_apbuart_ports[UART_NR];
+static struct device_node *grlib_apbuart_nodes[UART_NR];
+
+static int apbuart_scan_fifo_size(struct uart_port *port, int portnumber)
+{
+	int ctrl, loop = 0;
+	int status;
+	int fifosize;
+	unsigned long flags;
+
+	ctrl = UART_GET_CTRL(port);
+
+	/*
+	 * Enable the transceiver and wait for it to be ready to send data.
+	 * Clear interrupts so that this process will not be externally
+	 * interrupted in the middle (which can cause the transceiver to
+	 * drain prematurely).
+	 */
+
+	local_irq_save(flags);
+
+	UART_PUT_CTRL(port, ctrl | UART_CTRL_TE);
+
+	while (!UART_TX_READY(UART_GET_STATUS(port)))
+		loop++;
+
+	/*
+	 * Disable the transceiver so data isn't actually sent during the
+	 * actual test.
+	 */
+
+	UART_PUT_CTRL(port, ctrl & ~(UART_CTRL_TE));
+
+	fifosize = 1;
+	UART_PUT_CHAR(port, 0);
+
+	/*
+	 * So long as transmitting a character increments the tranceivier FIFO
+	 * length the FIFO must be at least that big. These bytes will
+	 * automatically drain off of the FIFO.
+	 */
+
+	status = UART_GET_STATUS(port);
+	while (((status >> 20) & 0x3F) == fifosize) {
+		fifosize++;
+		UART_PUT_CHAR(port, 0);
+		status = UART_GET_STATUS(port);
+	}
+
+	fifosize--;
+
+	UART_PUT_CTRL(port, ctrl);
+	local_irq_restore(flags);
+
+	if (fifosize == 0)
+		fifosize = 1;
+
+	return fifosize;
+}
+
+static void apbuart_flush_fifo(struct uart_port *port)
+{
+	int i;
+
+	for (i = 0; i < port->fifosize; i++)
+		UART_GET_CHAR(port);
+}
+
+
+/* ======================================================================== */
+/* Console driver, if enabled                                               */
+/* ======================================================================== */
+
+#ifdef CONFIG_SERIAL_GRLIB_GAISLER_APBUART_CONSOLE
+
+static void apbuart_console_putchar(struct uart_port *port, int ch)
+{
+	unsigned int status;
+	do {
+		status = UART_GET_STATUS(port);
+	} while (!UART_TX_READY(status));
+	UART_PUT_CHAR(port, ch);
+}
+
+static void
+apbuart_console_write(struct console *co, const char *s, unsigned int count)
+{
+	struct uart_port *port = &grlib_apbuart_ports[co->index];
+	unsigned int status, old_cr, new_cr;
+
+	/* First save the CR then disable the interrupts */
+	old_cr = UART_GET_CTRL(port);
+	new_cr = old_cr & ~(UART_CTRL_RI | UART_CTRL_TI);
+	UART_PUT_CTRL(port, new_cr);
+
+	uart_console_write(port, s, count, apbuart_console_putchar);
+
+	/*
+	 *      Finally, wait for transmitter to become empty
+	 *      and restore the TCR
+	 */
+	do {
+		status = UART_GET_STATUS(port);
+	} while (!UART_TX_READY(status));
+	UART_PUT_CTRL(port, old_cr);
+}
+
+static void __init
+apbuart_console_get_options(struct uart_port *port, int *baud,
+			    int *parity, int *bits)
+{
+	if (UART_GET_CTRL(port) & (UART_CTRL_RE | UART_CTRL_TE)) {
+
+		unsigned int quot, status;
+		status = UART_GET_STATUS(port);
+
+		*parity = 'n';
+		if (status & UART_CTRL_PE) {
+			if ((status & UART_CTRL_PS) == 0)
+				*parity = 'e';
+			else
+				*parity = 'o';
+		}
+
+		*bits = 8;
+		quot = UART_GET_SCAL(port) / 8;
+		*baud = port->uartclk / (16 * (quot + 1));
+	}
+}
+
+static int __init apbuart_console_setup(struct console *co, char *options)
+{
+	struct uart_port *port;
+	int baud = 38400;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	pr_debug("apbuart_console_setup co=%p, co->index=%i, options=%s\n",
+		 co, co->index, options);
+
+	/*
+	 * Check whether an invalid uart number has been specified, and
+	 * if so, search for the first available port that does have
+	 * console support.
+	 */
+	if (co->index >= grlib_apbuart_port_nr)
+		co->index = 0;
+
+	port = &grlib_apbuart_ports[co->index];
+
+	spin_lock_init(&port->lock);
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+	else
+		apbuart_console_get_options(port, &baud, &parity, &bits);
+
+	return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver grlib_apbuart_driver;
+
+static struct console grlib_apbuart_console = {
+	.name = "ttyS",
+	.write = apbuart_console_write,
+	.device = uart_console_device,
+	.setup = apbuart_console_setup,
+	.flags = CON_PRINTBUFFER,
+	.index = -1,
+	.data = &grlib_apbuart_driver,
+};
+
+
+static void grlib_apbuart_configure(void);
+
+static int __init apbuart_console_init(void)
+{
+	grlib_apbuart_configure();
+	register_console(&grlib_apbuart_console);
+	return 0;
+}
+
+console_initcall(apbuart_console_init);
+
+#define APBUART_CONSOLE	(&grlib_apbuart_console)
+#else
+#define APBUART_CONSOLE	NULL
+#endif
+
+static struct uart_driver grlib_apbuart_driver = {
+	.owner = THIS_MODULE,
+	.driver_name = "serial",
+	.dev_name = "ttyS",
+	.major = SERIAL_APBUART_MAJOR,
+	.minor = SERIAL_APBUART_MINOR,
+	.nr = UART_NR,
+	.cons = APBUART_CONSOLE,
+};
+
+
+/* ======================================================================== */
+/* OF Platform Driver                                                       */
+/* ======================================================================== */
+
+static int __devinit apbuart_probe(struct of_device *op,
+				   const struct of_device_id *match)
+{
+	int i = -1;
+	struct uart_port *port = NULL;
+
+	i = 0;
+	for (i = 0; i < grlib_apbuart_port_nr; i++) {
+		if (op->node == grlib_apbuart_nodes[i])
+			break;
+	}
+
+	port = &grlib_apbuart_ports[i];
+	port->dev = &op->dev;
+
+	uart_add_one_port(&grlib_apbuart_driver, (struct uart_port *) port);
+
+	apbuart_flush_fifo((struct uart_port *) port);
+
+	printk(KERN_INFO "grlib-apbuart at 0x%x, irq %d\n",
+	       port->mapbase, port->irq);
+	return 0;
+
+}
+
+static struct of_device_id __initdata apbuart_match[] = {
+	{
+	 .name = "GAISLER_APBUART",
+	 },
+	{},
+};
+
+static struct of_platform_driver grlib_apbuart_of_driver = {
+	.match_table = apbuart_match,
+	.probe = apbuart_probe,
+	.driver = {
+		   .owner = THIS_MODULE,
+		   .name = "grlib-apbuart",
+		   },
+};
+
+
+static void grlib_apbuart_configure(void)
+{
+	static int enum_done;
+	struct device_node *np;
+	struct uart_port *port = NULL;
+
+	int node;
+	int freq_khz;
+	int v = 0, d = 0;
+	unsigned int addr;
+	int irq, line;
+	struct amba_prom_registers *regs;
+
+	if (enum_done)
+		return;
+
+	/* Get bus frequency */
+	node = prom_getchild(prom_root_node);
+	freq_khz = prom_getint(node, "clock-frequency");
+
+	line = 0;
+	for_each_matching_node(np, apbuart_match) {
+
+		int *vendor = (int *) of_get_property(np, "vendor", NULL);
+		int *device = (int *) of_get_property(np, "device", NULL);
+		int *irqs = (int *) of_get_property(np, "interrupts", NULL);
+		regs = (struct amba_prom_registers *)
+		    of_get_property(np, "reg", NULL);
+
+		if (vendor)
+			v = *vendor;
+		if (device)
+			d = *device;
+
+		if (!irqs || !regs)
+			return;
+
+		grlib_apbuart_nodes[line] = np;
+
+		addr = regs->phys_addr;
+		irq = *irqs;
+
+		port = &grlib_apbuart_ports[line];
+
+		port->mapbase = addr;
+		port->membase = ioremap(addr, sizeof(struct grlib_apbuart_regs_map));
+		port->irq = irq;
+		port->iotype = UPIO_MEM;
+		port->ops = &grlib_apbuart_ops;
+		port->flags = UPF_BOOT_AUTOCONF;
+		port->line = line;
+		port->uartclk = freq_khz * 1000;
+		port->fifosize = apbuart_scan_fifo_size((struct uart_port *) port, line);
+		line++;
+
+		/* We support maximum UART_NR uarts ... */
+		if (line == UART_NR)
+			break;
+
+	}
+
+	enum_done = 1;
+
+	grlib_apbuart_driver.nr = grlib_apbuart_port_nr = line;
+}
+
+static int __init grlib_apbuart_init(void)
+{
+	int ret;
+
+	/* Find all APBUARTS in device the tree and initialize their ports */
+	grlib_apbuart_configure();
+
+	printk(KERN_INFO "Serial: GRLIB APBUART driver\n");
+
+	ret = uart_register_driver(&grlib_apbuart_driver);
+
+	if (ret) {
+		printk(KERN_ERR "%s: uart_register_driver failed (%i)\n",
+		       __FILE__, ret);
+		return ret;
+	}
+
+	ret = of_register_driver(&grlib_apbuart_of_driver, &of_platform_bus_type);
+
+	if (ret) {
+		printk(KERN_ERR
+		       "%s: of_register_platform_driver failed (%i)\n",
+		       __FILE__, ret);
+		uart_unregister_driver(&grlib_apbuart_driver);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit grlib_apbuart_exit(void)
+{
+	int i;
+
+	for (i = 0; i < grlib_apbuart_port_nr; i++)
+		uart_remove_one_port(&grlib_apbuart_driver,
+				     &grlib_apbuart_ports[i]);
+
+	uart_unregister_driver(&grlib_apbuart_driver);
+
+}
+
+module_init(grlib_apbuart_init);
+module_exit(grlib_apbuart_exit);
+
+MODULE_AUTHOR("Aeroflex Gaisler AB");
+MODULE_DESCRIPTION("GRLIB APBUART serial driver");
+MODULE_VERSION("2.1");
+MODULE_LICENSE("GPL");
diff --git a/drivers/serial/apbuart.h b/drivers/serial/apbuart.h
new file mode 100644
index 000000000000..5faf87c8d2bc
--- /dev/null
+++ b/drivers/serial/apbuart.h
@@ -0,0 +1,64 @@
+#ifndef __GRLIB_APBUART_H__
+#define __GRLIB_APBUART_H__
+
+#include <asm/io.h>
+
+#define UART_NR		8
+static int grlib_apbuart_port_nr;
+
+struct grlib_apbuart_regs_map {
+	u32 data;
+	u32 status;
+	u32 ctrl;
+	u32 scaler;
+};
+
+struct amba_prom_registers {
+	unsigned int phys_addr;
+	unsigned int reg_size;
+};
+
+/*
+ *  The following defines the bits in the APBUART Status Registers.
+ */
+#define UART_STATUS_DR   0x00000001	/* Data Ready */
+#define UART_STATUS_TSE  0x00000002	/* TX Send Register Empty */
+#define UART_STATUS_THE  0x00000004	/* TX Hold Register Empty */
+#define UART_STATUS_BR   0x00000008	/* Break Error */
+#define UART_STATUS_OE   0x00000010	/* RX Overrun Error */
+#define UART_STATUS_PE   0x00000020	/* RX Parity Error */
+#define UART_STATUS_FE   0x00000040	/* RX Framing Error */
+#define UART_STATUS_ERR  0x00000078	/* Error Mask */
+
+/*
+ *  The following defines the bits in the APBUART Ctrl Registers.
+ */
+#define UART_CTRL_RE     0x00000001	/* Receiver enable */
+#define UART_CTRL_TE     0x00000002	/* Transmitter enable */
+#define UART_CTRL_RI     0x00000004	/* Receiver interrupt enable */
+#define UART_CTRL_TI     0x00000008	/* Transmitter irq */
+#define UART_CTRL_PS     0x00000010	/* Parity select */
+#define UART_CTRL_PE     0x00000020	/* Parity enable */
+#define UART_CTRL_FL     0x00000040	/* Flow control enable */
+#define UART_CTRL_LB     0x00000080	/* Loopback enable */
+
+#define APBBASE(port) ((struct grlib_apbuart_regs_map *)((port)->membase))
+
+#define APBBASE_DATA_P(port)	(&(APBBASE(port)->data))
+#define APBBASE_STATUS_P(port)	(&(APBBASE(port)->status))
+#define APBBASE_CTRL_P(port)	(&(APBBASE(port)->ctrl))
+#define APBBASE_SCALAR_P(port)	(&(APBBASE(port)->scaler))
+
+#define UART_GET_CHAR(port)	(__raw_readl(APBBASE_DATA_P(port)))
+#define UART_PUT_CHAR(port, v)	(__raw_writel(v, APBBASE_DATA_P(port)))
+#define UART_GET_STATUS(port)	(__raw_readl(APBBASE_STATUS_P(port)))
+#define UART_PUT_STATUS(port, v)(__raw_writel(v, APBBASE_STATUS_P(port)))
+#define UART_GET_CTRL(port)	(__raw_readl(APBBASE_CTRL_P(port)))
+#define UART_PUT_CTRL(port, v)	(__raw_writel(v, APBBASE_CTRL_P(port)))
+#define UART_GET_SCAL(port)	(__raw_readl(APBBASE_SCALAR_P(port)))
+#define UART_PUT_SCAL(port, v)	(__raw_writel(v, APBBASE_SCALAR_P(port)))
+
+#define UART_RX_DATA(s)		(((s) & UART_STATUS_DR) != 0)
+#define UART_TX_READY(s)	(((s) & UART_STATUS_THE) != 0)
+
+#endif /* __GRLIB_APBUART_H__ */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index db532ce288be..8c3dd36fe91a 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -179,6 +179,9 @@
 /* BCM63xx family SoCs */
 #define PORT_BCM63XX	89
 
+/* Aeroflex Gaisler GRLIB APBUART */
+#define PORT_APBUART    90
+
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-- 
cgit v1.2.3


From 2a855dd01bc1539111adb7233f587c5c468732ac Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Date: Sun, 25 Oct 2009 15:37:58 +0100
Subject: signal: Fix alternate signal stack check

All architectures in the kernel increment/decrement the stack pointer
before storing values on the stack.

On architectures which have the stack grow down sas_ss_sp == sp is not
on the alternate signal stack while sas_ss_sp + sas_ss_size == sp is
on the alternate signal stack.

On architectures which have the stack grow up sas_ss_sp == sp is on
the alternate signal stack while sas_ss_sp + sas_ss_size == sp is not
on the alternate signal stack.

The current implementation fails for architectures which have the
stack grow down on the corner case where sas_ss_sp == sp.This was
reported as Debian bug #544905 on AMD64.
Simplified test case: http://download.breakpoint.cc/tc-sig-stack.c

The test case creates the following stack scenario:
   0xn0300	stack top
   0xn0200	alt stack pointer top (when switching to alt stack)
   0xn01ff	alt stack end
   0xn0100	alt stack start == stack pointer

If the signal is sent the stack pointer is pointing to the base
address of the alt stack and the kernel erroneously decides that it
has already switched to the alternate stack because of the current
check for "sp - sas_ss_sp < sas_ss_size"

On parisc (stack grows up) the scenario would be:
   0xn0200	stack pointer
   0xn01ff	alt stack end
   0xn0100	alt stack start = alt stack pointer base
   		    	  	  (when switching to alt stack)
   0xn0000	stack base

This is handled correctly by the current implementation.

[ tglx: Modified for archs which have the stack grow up (parisc) which
  	would fail with the correct implementation for stack grows
  	down. Added a check for sp >= current->sas_ss_sp which is
  	strictly not necessary but makes the code symetric for both
  	variants ]

Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: stable@kernel.org
LKML-Reference: <20091025143758.GA6653@Chamillionaire.breakpoint.cc>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60bf583..0f67914a43c9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2086,11 +2086,18 @@ static inline int is_si_special(const struct siginfo *info)
 	return info <= SEND_SIG_FORCED;
 }
 
-/* True if we are on the alternate signal stack.  */
-
+/*
+ * True if we are on the alternate signal stack.
+ */
 static inline int on_sig_stack(unsigned long sp)
 {
-	return (sp - current->sas_ss_sp < current->sas_ss_size);
+#ifdef CONFIG_STACK_GROWSUP
+	return sp >= current->sas_ss_sp &&
+		sp - current->sas_ss_sp < current->sas_ss_size;
+#else
+	return sp > current->sas_ss_sp &&
+		sp - current->sas_ss_sp <= current->sas_ss_size;
+#endif
 }
 
 static inline int sas_ss_flags(unsigned long sp)
-- 
cgit v1.2.3


From d94d9fee9fa4e66a0b91640a694b8b10177075b3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 4 Nov 2009 09:50:58 -0800
Subject: net: cleanup include/linux

This cleanup patch puts struct/union/enum opening braces,
in first line to ease grep games.

struct something
{

becomes :

struct something {

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dn.h                             |   9 +-
 include/linux/errqueue.h                       |   6 +-
 include/linux/fib_rules.h                      |   9 +-
 include/linux/filter.h                         |   6 +-
 include/linux/gen_stats.h                      |  15 ++--
 include/linux/if.h                             |  15 ++--
 include/linux/if_addr.h                        |   9 +-
 include/linux/if_addrlabel.h                   |   6 +-
 include/linux/if_arcnet.h                      |  18 ++--
 include/linux/if_arp.h                         |   3 +-
 include/linux/if_bonding.h                     |   3 +-
 include/linux/if_bridge.h                      |   9 +-
 include/linux/if_ec.h                          |  12 +--
 include/linux/if_fddi.h                        |  22 ++---
 include/linux/if_hippi.h                       |  15 ++--
 include/linux/if_link.h                        |  27 ++----
 include/linux/if_packet.h                      |  27 ++----
 include/linux/if_plip.h                        |   3 +-
 include/linux/if_pppol2tp.h                    |   3 +-
 include/linux/if_tunnel.h                      |   6 +-
 include/linux/igmp.h                           |  15 ++--
 include/linux/in.h                             |  18 ++--
 include/linux/in6.h                            |   9 +-
 include/linux/inetdevice.h                     |   9 +-
 include/linux/ip_vs.h                          |   3 +-
 include/linux/mroute.h                         |  18 ++--
 include/linux/mroute6.h                        |  15 ++--
 include/linux/neighbour.h                      |  18 ++--
 include/linux/netdevice.h                      |  30 +++----
 include/linux/netfilter.h                      |   6 +-
 include/linux/netfilter/nf_conntrack_common.h  |   6 +-
 include/linux/netfilter/nf_conntrack_ftp.h     |   3 +-
 include/linux/netfilter/nf_conntrack_sctp.h    |   3 +-
 include/linux/netfilter/nf_conntrack_tcp.h     |   3 +-
 include/linux/netfilter/nfnetlink.h            |   6 +-
 include/linux/netfilter/nfnetlink_compat.h     |   3 +-
 include/linux/netfilter/x_tables.h             |  45 ++++------
 include/linux/netfilter/xt_connbytes.h         |   3 +-
 include/linux/netfilter/xt_esp.h               |   3 +-
 include/linux/netfilter/xt_multiport.h         |   9 +-
 include/linux/netfilter/xt_policy.h            |  18 ++--
 include/linux/netfilter/xt_state.h             |   3 +-
 include/linux/netfilter/xt_string.h            |   3 +-
 include/linux/netfilter/xt_tcpudp.h            |   6 +-
 include/linux/netfilter_arp/arp_tables.h       |  21 ++---
 include/linux/netfilter_bridge/ebt_802_3.h     |   3 +-
 include/linux/netfilter_bridge/ebt_among.h     |   9 +-
 include/linux/netfilter_bridge/ebt_arpreply.h  |   3 +-
 include/linux/netfilter_bridge/ebt_ip.h        |   3 +-
 include/linux/netfilter_bridge/ebt_ip6.h       |   3 +-
 include/linux/netfilter_bridge/ebt_limit.h     |   3 +-
 include/linux/netfilter_bridge/ebt_log.h       |   3 +-
 include/linux/netfilter_bridge/ebt_mark_m.h    |   3 +-
 include/linux/netfilter_bridge/ebt_mark_t.h    |   3 +-
 include/linux/netfilter_bridge/ebt_nat.h       |   3 +-
 include/linux/netfilter_bridge/ebt_pkttype.h   |   3 +-
 include/linux/netfilter_bridge/ebt_redirect.h  |   3 +-
 include/linux/netfilter_bridge/ebt_stp.h       |   6 +-
 include/linux/netfilter_bridge/ebtables.h      |  39 +++------
 include/linux/netfilter_ipv4/ip_tables.h       |  27 ++----
 include/linux/netfilter_ipv4/ipt_SAME.h        |   3 +-
 include/linux/netfilter_ipv4/ipt_ah.h          |   3 +-
 include/linux/netfilter_ipv6/ip6_tables.h      |  27 ++----
 include/linux/netfilter_ipv6/ip6t_ah.h         |   3 +-
 include/linux/netfilter_ipv6/ip6t_frag.h       |   3 +-
 include/linux/netfilter_ipv6/ip6t_ipv6header.h |   3 +-
 include/linux/netfilter_ipv6/ip6t_mh.h         |   3 +-
 include/linux/netfilter_ipv6/ip6t_opts.h       |   3 +-
 include/linux/netfilter_ipv6/ip6t_rt.h         |   3 +-
 include/linux/netlink.h                        |  24 ++----
 include/linux/pkt_cls.h                        |  84 +++++++------------
 include/linux/pkt_sched.h                      | 111 +++++++++----------------
 include/linux/route.h                          |   3 +-
 include/linux/rtnetlink.h                      |  57 +++++--------
 include/linux/skbuff.h                         |   3 +-
 include/linux/tc_act/tc_defact.h               |   6 +-
 include/linux/tc_act/tc_gact.h                 |   9 +-
 include/linux/tc_act/tc_ipt.h                  |   3 +-
 include/linux/tc_act/tc_mirred.h               |   6 +-
 include/linux/tc_act/tc_nat.h                  |   6 +-
 include/linux/tc_act/tc_pedit.h                |   9 +-
 include/linux/tc_ematch/tc_em_cmp.h            |   6 +-
 include/linux/tc_ematch/tc_em_meta.h           |  15 ++--
 include/linux/tc_ematch/tc_em_nbyte.h          |   3 +-
 include/linux/tc_ematch/tc_em_text.h           |   3 +-
 include/linux/tcp.h                            |   6 +-
 include/linux/xfrm.h                           |  27 ++----
 87 files changed, 351 insertions(+), 694 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dn.h b/include/linux/dn.h
index fe9990823193..9c50445462d9 100644
--- a/include/linux/dn.h
+++ b/include/linux/dn.h
@@ -71,14 +71,12 @@
 /* Structures */
 
 
-struct dn_naddr 
-{
+struct dn_naddr {
 	__le16		a_len;
 	__u8 a_addr[DN_MAXADDL]; /* Two bytes little endian */
 };
 
-struct sockaddr_dn
-{
+struct sockaddr_dn {
 	__u16		sdn_family;
 	__u8		sdn_flags;
 	__u8		sdn_objnum;
@@ -101,8 +99,7 @@ struct optdata_dn {
         __u8   opt_data[16];   /* User data              */
 };
 
-struct accessdata_dn
-{
+struct accessdata_dn {
 	__u8		acc_accl;
 	__u8		acc_acc[DN_MAXACCL];
 	__u8 		acc_passl;
diff --git a/include/linux/errqueue.h b/include/linux/errqueue.h
index ec12cc74366f..034072cea853 100644
--- a/include/linux/errqueue.h
+++ b/include/linux/errqueue.h
@@ -3,8 +3,7 @@
 
 #include <linux/types.h>
 
-struct sock_extended_err
-{
+struct sock_extended_err {
 	__u32	ee_errno;	
 	__u8	ee_origin;
 	__u8	ee_type;
@@ -31,8 +30,7 @@ struct sock_extended_err
 
 #define SKB_EXT_ERR(skb) ((struct sock_exterr_skb *) ((skb)->cb))
 
-struct sock_exterr_skb
-{
+struct sock_exterr_skb {
 	union {
 		struct inet_skb_parm	h4;
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h
index 87b606b63f1e..c7e5b700bb91 100644
--- a/include/linux/fib_rules.h
+++ b/include/linux/fib_rules.h
@@ -13,8 +13,7 @@
 /* try to find source address in routing lookups */
 #define FIB_RULE_FIND_SADDR	0x00010000
 
-struct fib_rule_hdr
-{
+struct fib_rule_hdr {
 	__u8		family;
 	__u8		dst_len;
 	__u8		src_len;
@@ -28,8 +27,7 @@ struct fib_rule_hdr
 	__u32		flags;
 };
 
-enum
-{
+enum {
 	FRA_UNSPEC,
 	FRA_DST,	/* destination address */
 	FRA_SRC,	/* source address */
@@ -52,8 +50,7 @@ enum
 
 #define FRA_MAX (__FRA_MAX - 1)
 
-enum
-{
+enum {
 	FR_ACT_UNSPEC,
 	FR_ACT_TO_TBL,		/* Pass to fixed table */
 	FR_ACT_GOTO,		/* Jump to another rule */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index bb3b4352978a..29a0e3db9f43 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -23,16 +23,14 @@
  *	the BPF code definitions which need to match so you can share filters
  */
  
-struct sock_filter	/* Filter block */
-{
+struct sock_filter {	/* Filter block */
 	__u16	code;   /* Actual filter code */
 	__u8	jt;	/* Jump true */
 	__u8	jf;	/* Jump false */
 	__u32	k;      /* Generic multiuse field */
 };
 
-struct sock_fprog	/* Required for SO_ATTACH_FILTER. */
-{
+struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 	unsigned short		len;	/* Number of filter blocks */
 	struct sock_filter __user *filter;
 };
diff --git a/include/linux/gen_stats.h b/include/linux/gen_stats.h
index 710e901085d0..552c8a0a12d1 100644
--- a/include/linux/gen_stats.h
+++ b/include/linux/gen_stats.h
@@ -18,13 +18,11 @@ enum {
  * @bytes: number of seen bytes
  * @packets: number of seen packets
  */
-struct gnet_stats_basic
-{
+struct gnet_stats_basic {
 	__u64	bytes;
 	__u32	packets;
 };
-struct gnet_stats_basic_packed
-{
+struct gnet_stats_basic_packed {
 	__u64	bytes;
 	__u32	packets;
 } __attribute__ ((packed));
@@ -34,8 +32,7 @@ struct gnet_stats_basic_packed
  * @bps: current byte rate
  * @pps: current packet rate
  */
-struct gnet_stats_rate_est
-{
+struct gnet_stats_rate_est {
 	__u32	bps;
 	__u32	pps;
 };
@@ -48,8 +45,7 @@ struct gnet_stats_rate_est
  * @requeues: number of requeues
  * @overlimits: number of enqueues over the limit
  */
-struct gnet_stats_queue
-{
+struct gnet_stats_queue {
 	__u32	qlen;
 	__u32	backlog;
 	__u32	drops;
@@ -62,8 +58,7 @@ struct gnet_stats_queue
  * @interval: sampling period
  * @ewma_log: the log of measurement window weight
  */
-struct gnet_estimator
-{
+struct gnet_estimator {
 	signed char	interval;
 	unsigned char	ewma_log;
 };
diff --git a/include/linux/if.h b/include/linux/if.h
index b9a6229f3be7..3b2a46bf8f8d 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -125,8 +125,7 @@ enum {
  *	being very small might be worth keeping for clean configuration.
  */
 
-struct ifmap 
-{
+struct ifmap {
 	unsigned long mem_start;
 	unsigned long mem_end;
 	unsigned short base_addr; 
@@ -136,8 +135,7 @@ struct ifmap
 	/* 3 bytes spare */
 };
 
-struct if_settings
-{
+struct if_settings {
 	unsigned int type;	/* Type of physical device or protocol */
 	unsigned int size;	/* Size of the data allocated by the caller */
 	union {
@@ -161,8 +159,7 @@ struct if_settings
  * remainder may be interface specific.
  */
 
-struct ifreq 
-{
+struct ifreq {
 #define IFHWADDRLEN	6
 	union
 	{
@@ -211,11 +208,9 @@ struct ifreq
  * must know all networks accessible).
  */
 
-struct ifconf 
-{
+struct ifconf  {
 	int	ifc_len;			/* size of buffer	*/
-	union 
-	{
+	union {
 		char __user *ifcu_buf;
 		struct ifreq __user *ifcu_req;
 	} ifc_ifcu;
diff --git a/include/linux/if_addr.h b/include/linux/if_addr.h
index fd9740466757..23357ab81a77 100644
--- a/include/linux/if_addr.h
+++ b/include/linux/if_addr.h
@@ -4,8 +4,7 @@
 #include <linux/types.h>
 #include <linux/netlink.h>
 
-struct ifaddrmsg
-{
+struct ifaddrmsg {
 	__u8		ifa_family;
 	__u8		ifa_prefixlen;	/* The prefix length		*/
 	__u8		ifa_flags;	/* Flags			*/
@@ -20,8 +19,7 @@ struct ifaddrmsg
  * but for point-to-point IFA_ADDRESS is DESTINATION address,
  * local address is supplied in IFA_LOCAL attribute.
  */
-enum
-{
+enum {
 	IFA_UNSPEC,
 	IFA_ADDRESS,
 	IFA_LOCAL,
@@ -47,8 +45,7 @@ enum
 #define IFA_F_TENTATIVE		0x40
 #define IFA_F_PERMANENT		0x80
 
-struct ifa_cacheinfo
-{
+struct ifa_cacheinfo {
 	__u32	ifa_prefered;
 	__u32	ifa_valid;
 	__u32	cstamp; /* created timestamp, hundredths of seconds */
diff --git a/include/linux/if_addrlabel.h b/include/linux/if_addrlabel.h
index 89571f65d6de..54580c298187 100644
--- a/include/linux/if_addrlabel.h
+++ b/include/linux/if_addrlabel.h
@@ -12,8 +12,7 @@
 
 #include <linux/types.h>
 
-struct ifaddrlblmsg
-{
+struct ifaddrlblmsg {
 	__u8		ifal_family;		/* Address family */
 	__u8		__ifal_reserved;	/* Reserved */
 	__u8		ifal_prefixlen;		/* Prefix length */
@@ -22,8 +21,7 @@ struct ifaddrlblmsg
 	__u32		ifal_seq;		/* sequence number */
 };
 
-enum
-{
+enum {
 	IFAL_ADDRESS = 1,
 	IFAL_LABEL = 2,
 	__IFAL_MAX
diff --git a/include/linux/if_arcnet.h b/include/linux/if_arcnet.h
index 0835debab115..46e34bd0e783 100644
--- a/include/linux/if_arcnet.h
+++ b/include/linux/if_arcnet.h
@@ -56,8 +56,7 @@
 /*
  * The RFC1201-specific components of an arcnet packet header.
  */
-struct arc_rfc1201
-{
+struct arc_rfc1201 {
     __u8  proto;		/* protocol ID field - varies		*/
     __u8  split_flag;	/* for use with split packets		*/
     __be16   sequence;		/* sequence number			*/
@@ -69,8 +68,7 @@ struct arc_rfc1201
 /*
  * The RFC1051-specific components.
  */
-struct arc_rfc1051
-{
+struct arc_rfc1051 {
     __u8 proto;		/* ARC_P_RFC1051_ARP/RFC1051_IP	*/
     __u8 payload[0];		/* 507 bytes			*/
 };
@@ -81,8 +79,7 @@ struct arc_rfc1051
  * The ethernet-encap-specific components.  We have a real ethernet header
  * and some data.
  */
-struct arc_eth_encap
-{
+struct arc_eth_encap {
     __u8 proto;		/* Always ARC_P_ETHER			*/
     struct ethhdr eth;		/* standard ethernet header (yuck!)	*/
     __u8 payload[0];		/* 493 bytes				*/
@@ -90,8 +87,7 @@ struct arc_eth_encap
 #define ETH_ENCAP_HDR_SIZE 14
 
 
-struct arc_cap
-{
+struct arc_cap {
 	__u8 proto;
 	__u8 cookie[sizeof(int)];   /* Actually NOT sent over the network */
 	union {
@@ -108,8 +104,7 @@ struct arc_cap
  * the _end_ of the 512-byte buffer.  We hide this complexity inside the
  * driver.
  */
-struct arc_hardware
-{
+struct arc_hardware {
     __u8  source,		/* source ARCnet - filled in automagically */
              dest,		/* destination ARCnet - 0 for broadcast    */
     	     offset[2];		/* offset bytes (some weird semantics)     */
@@ -120,8 +115,7 @@ struct arc_hardware
  * This is an ARCnet frame header, as seen by the kernel (and userspace,
  * when you do a raw packet capture).
  */
-struct archdr
-{
+struct archdr {
     /* hardware requirements */
     struct arc_hardware hard;
      
diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h
index 282eb37e2dec..e80b7f88f7c6 100644
--- a/include/linux/if_arp.h
+++ b/include/linux/if_arp.h
@@ -133,8 +133,7 @@ struct arpreq_old {
  *	This structure defines an ethernet arp header.
  */
 
-struct arphdr
-{
+struct arphdr {
 	__be16		ar_hrd;		/* format of hardware address	*/
 	__be16		ar_pro;		/* format of protocol address	*/
 	unsigned char	ar_hln;		/* length of hardware address	*/
diff --git a/include/linux/if_bonding.h b/include/linux/if_bonding.h
index 65c2d247068b..cd525fae3c98 100644
--- a/include/linux/if_bonding.h
+++ b/include/linux/if_bonding.h
@@ -94,8 +94,7 @@ typedef struct ifbond {
 	__s32 miimon;
 } ifbond;
 
-typedef struct ifslave
-{
+typedef struct ifslave {
 	__s32 slave_id; /* Used as an IN param to the BOND_SLAVE_INFO_QUERY ioctl */
 	char slave_name[IFNAMSIZ];
 	__s8 link;
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 6badb3e2c4e4..938b7e81df95 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -49,8 +49,7 @@
 #define BR_STATE_FORWARDING 3
 #define BR_STATE_BLOCKING 4
 
-struct __bridge_info
-{
+struct __bridge_info {
 	__u64 designated_root;
 	__u64 bridge_id;
 	__u32 root_path_cost;
@@ -72,8 +71,7 @@ struct __bridge_info
 	__u32 gc_timer_value;
 };
 
-struct __port_info
-{
+struct __port_info {
 	__u64 designated_root;
 	__u64 designated_bridge;
 	__u16 port_id;
@@ -89,8 +87,7 @@ struct __port_info
 	__u32 hold_timer_value;
 };
 
-struct __fdb_entry
-{
+struct __fdb_entry {
 	__u8 mac_addr[6];
 	__u8 port_no;
 	__u8 is_local;
diff --git a/include/linux/if_ec.h b/include/linux/if_ec.h
index e7499aa79783..d85f9f48129f 100644
--- a/include/linux/if_ec.h
+++ b/include/linux/if_ec.h
@@ -5,14 +5,12 @@
 
 /* User visible stuff. Glibc provides its own but libc5 folk will use these */
 
-struct ec_addr
-{
+struct ec_addr {
   unsigned char station;		/* Station number.  */
   unsigned char net;			/* Network number.  */
 };
 
-struct sockaddr_ec
-{
+struct sockaddr_ec {
   unsigned short sec_family;
   unsigned char port;			/* Port number.  */
   unsigned char cb;			/* Control/flag byte.  */
@@ -37,8 +35,7 @@ struct sockaddr_ec
 #define EC_HLEN				6
 
 /* This is what an Econet frame looks like on the wire. */
-struct ec_framehdr 
-{
+struct ec_framehdr {
   unsigned char dst_stn;
   unsigned char dst_net;
   unsigned char src_stn;
@@ -62,8 +59,7 @@ static inline struct econet_sock *ec_sk(const struct sock *sk)
 	return (struct econet_sock *)sk;
 }
 
-struct ec_device
-{
+struct ec_device {
   unsigned char station, net;		/* Econet protocol address */
 };
 
diff --git a/include/linux/if_fddi.h b/include/linux/if_fddi.h
index 45de1046dbbf..5459c5c09930 100644
--- a/include/linux/if_fddi.h
+++ b/include/linux/if_fddi.h
@@ -63,36 +63,32 @@
 #define FDDI_UI_CMD			0x03
 
 /* Define 802.2 Type 1 header */
-struct fddi_8022_1_hdr
-	{
+struct fddi_8022_1_hdr {
 	__u8	dsap;					/* destination service access point */
 	__u8	ssap;					/* source service access point */
 	__u8	ctrl;					/* control byte #1 */
-	} __attribute__ ((packed));
+} __attribute__ ((packed));
 
 /* Define 802.2 Type 2 header */
-struct fddi_8022_2_hdr
-	{
+struct fddi_8022_2_hdr {
 	__u8	dsap;					/* destination service access point */
 	__u8	ssap;					/* source service access point */
 	__u8	ctrl_1;					/* control byte #1 */
 	__u8	ctrl_2;					/* control byte #2 */
-	} __attribute__ ((packed));
+} __attribute__ ((packed));
 
 /* Define 802.2 SNAP header */
 #define FDDI_K_OUI_LEN	3
-struct fddi_snap_hdr
-	{
+struct fddi_snap_hdr {
 	__u8	dsap;					/* always 0xAA */
 	__u8	ssap;					/* always 0xAA */
 	__u8	ctrl;					/* always 0x03 */
 	__u8	oui[FDDI_K_OUI_LEN];	/* organizational universal id */
 	__be16	ethertype;				/* packet type ID field */
-	} __attribute__ ((packed));
+} __attribute__ ((packed));
 
 /* Define FDDI LLC frame header */
-struct fddihdr
-	{
+struct fddihdr {
 	__u8	fc;						/* frame control */
 	__u8	daddr[FDDI_K_ALEN];		/* destination address */
 	__u8	saddr[FDDI_K_ALEN];		/* source address */
@@ -102,7 +98,7 @@ struct fddihdr
 		struct fddi_8022_2_hdr		llc_8022_2;
 		struct fddi_snap_hdr		llc_snap;
 		} hdr;
-	} __attribute__ ((packed));
+} __attribute__ ((packed));
 
 #ifdef __KERNEL__
 #include <linux/netdevice.h>
@@ -197,7 +193,7 @@ struct fddi_statistics {
 	__u32	port_pc_withhold[2];
 	__u32	port_ler_flag[2];
 	__u32	port_hardware_present[2];
-	};
+};
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_IF_FDDI_H */
diff --git a/include/linux/if_hippi.h b/include/linux/if_hippi.h
index 4a7c9940b080..8d038eb8db5c 100644
--- a/include/linux/if_hippi.h
+++ b/include/linux/if_hippi.h
@@ -51,8 +51,7 @@
  *	HIPPI statistics collection data. 
  */
  
-struct hipnet_statistics
-{
+struct hipnet_statistics {
 	int	rx_packets;		/* total packets received	*/
 	int	tx_packets;		/* total packets transmitted	*/
 	int	rx_errors;		/* bad packets received		*/
@@ -77,8 +76,7 @@ struct hipnet_statistics
 };
 
 
-struct hippi_fp_hdr
-{
+struct hippi_fp_hdr {
 #if 0
 	__u8		ulp;				/* must contain 4 */
 #if defined (__BIG_ENDIAN_BITFIELD)
@@ -108,8 +106,7 @@ struct hippi_fp_hdr
 	__be32		d2_size;
 } __attribute__ ((packed));
 
-struct hippi_le_hdr
-{
+struct hippi_le_hdr {
 #if defined (__BIG_ENDIAN_BITFIELD)
 	__u8		fc:3;
 	__u8		double_wide:1;
@@ -139,8 +136,7 @@ struct hippi_le_hdr
  * Looks like the dsap and ssap fields have been swapped by mistake in
  * RFC 2067 "IP over HIPPI".
  */
-struct hippi_snap_hdr
-{
+struct hippi_snap_hdr {
 	__u8	dsap;			/* always 0xAA */
 	__u8	ssap;			/* always 0xAA */
 	__u8	ctrl;			/* always 0x03 */
@@ -148,8 +144,7 @@ struct hippi_snap_hdr
 	__be16	ethertype;		/* packet type ID field */
 } __attribute__ ((packed));
 
-struct hippi_hdr
-{
+struct hippi_hdr {
 	struct hippi_fp_hdr	fp;
 	struct hippi_le_hdr	le;
 	struct hippi_snap_hdr	snap;
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 176c5182c515..1d3b2425f661 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -5,8 +5,7 @@
 #include <linux/netlink.h>
 
 /* The struct should be in sync with struct net_device_stats */
-struct rtnl_link_stats
-{
+struct rtnl_link_stats {
 	__u32	rx_packets;		/* total packets received	*/
 	__u32	tx_packets;		/* total packets transmitted	*/
 	__u32	rx_bytes;		/* total bytes received 	*/
@@ -39,8 +38,7 @@ struct rtnl_link_stats
 };
 
 /* The struct should be in sync with struct ifmap */
-struct rtnl_link_ifmap
-{
+struct rtnl_link_ifmap {
 	__u64	mem_start;
 	__u64	mem_end;
 	__u64	base_addr;
@@ -49,8 +47,7 @@ struct rtnl_link_ifmap
 	__u8	port;
 };
 
-enum
-{
+enum {
 	IFLA_UNSPEC,
 	IFLA_ADDRESS,
 	IFLA_BROADCAST,
@@ -123,8 +120,7 @@ enum
  */
 
 /* Subtype attributes for IFLA_PROTINFO */
-enum
-{
+enum {
 	IFLA_INET6_UNSPEC,
 	IFLA_INET6_FLAGS,	/* link flags			*/
 	IFLA_INET6_CONF,	/* sysctl parameters		*/
@@ -137,16 +133,14 @@ enum
 
 #define IFLA_INET6_MAX	(__IFLA_INET6_MAX - 1)
 
-struct ifla_cacheinfo
-{
+struct ifla_cacheinfo {
 	__u32	max_reasm_len;
 	__u32	tstamp;		/* ipv6InterfaceTable updated timestamp */
 	__u32	reachable_time;
 	__u32	retrans_time;
 };
 
-enum
-{
+enum {
 	IFLA_INFO_UNSPEC,
 	IFLA_INFO_KIND,
 	IFLA_INFO_DATA,
@@ -158,8 +152,7 @@ enum
 
 /* VLAN section */
 
-enum
-{
+enum {
 	IFLA_VLAN_UNSPEC,
 	IFLA_VLAN_ID,
 	IFLA_VLAN_FLAGS,
@@ -175,8 +168,7 @@ struct ifla_vlan_flags {
 	__u32	mask;
 };
 
-enum
-{
+enum {
 	IFLA_VLAN_QOS_UNSPEC,
 	IFLA_VLAN_QOS_MAPPING,
 	__IFLA_VLAN_QOS_MAX
@@ -184,8 +176,7 @@ enum
 
 #define IFLA_VLAN_QOS_MAX	(__IFLA_VLAN_QOS_MAX - 1)
 
-struct ifla_vlan_qos_mapping
-{
+struct ifla_vlan_qos_mapping {
 	__u32 from;
 	__u32 to;
 };
diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index dea7d6b7cf98..4021d47cc437 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -3,15 +3,13 @@
 
 #include <linux/types.h>
 
-struct sockaddr_pkt
-{
+struct sockaddr_pkt {
 	unsigned short spkt_family;
 	unsigned char spkt_device[14];
 	__be16 spkt_protocol;
 };
 
-struct sockaddr_ll
-{
+struct sockaddr_ll {
 	unsigned short	sll_family;
 	__be16		sll_protocol;
 	int		sll_ifindex;
@@ -49,14 +47,12 @@ struct sockaddr_ll
 #define PACKET_TX_RING			13
 #define PACKET_LOSS			14
 
-struct tpacket_stats
-{
+struct tpacket_stats {
 	unsigned int	tp_packets;
 	unsigned int	tp_drops;
 };
 
-struct tpacket_auxdata
-{
+struct tpacket_auxdata {
 	__u32		tp_status;
 	__u32		tp_len;
 	__u32		tp_snaplen;
@@ -78,8 +74,7 @@ struct tpacket_auxdata
 #define TP_STATUS_SENDING	0x2
 #define TP_STATUS_WRONG_FORMAT	0x4
 
-struct tpacket_hdr
-{
+struct tpacket_hdr {
 	unsigned long	tp_status;
 	unsigned int	tp_len;
 	unsigned int	tp_snaplen;
@@ -93,8 +88,7 @@ struct tpacket_hdr
 #define TPACKET_ALIGN(x)	(((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1))
 #define TPACKET_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + sizeof(struct sockaddr_ll))
 
-struct tpacket2_hdr
-{
+struct tpacket2_hdr {
 	__u32		tp_status;
 	__u32		tp_len;
 	__u32		tp_snaplen;
@@ -107,8 +101,7 @@ struct tpacket2_hdr
 
 #define TPACKET2_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll))
 
-enum tpacket_versions
-{
+enum tpacket_versions {
 	TPACKET_V1,
 	TPACKET_V2,
 };
@@ -126,16 +119,14 @@ enum tpacket_versions
    - Pad to align to TPACKET_ALIGNMENT=16
  */
 
-struct tpacket_req
-{
+struct tpacket_req {
 	unsigned int	tp_block_size;	/* Minimal size of contiguous block */
 	unsigned int	tp_block_nr;	/* Number of blocks */
 	unsigned int	tp_frame_size;	/* Size of frame */
 	unsigned int	tp_frame_nr;	/* Total number of frames */
 };
 
-struct packet_mreq
-{
+struct packet_mreq {
 	int		mr_ifindex;
 	unsigned short	mr_type;
 	unsigned short	mr_alen;
diff --git a/include/linux/if_plip.h b/include/linux/if_plip.h
index 153a649915a2..6298c7e88b2b 100644
--- a/include/linux/if_plip.h
+++ b/include/linux/if_plip.h
@@ -15,8 +15,7 @@
 
 #define	SIOCDEVPLIP	SIOCDEVPRIVATE
 
-struct plipconf
-{
+struct plipconf {
 	unsigned short pcmd;
 	unsigned long  nibble;
 	unsigned long  trigger;
diff --git a/include/linux/if_pppol2tp.h b/include/linux/if_pppol2tp.h
index 3a14b088c8ec..c58baea4a25b 100644
--- a/include/linux/if_pppol2tp.h
+++ b/include/linux/if_pppol2tp.h
@@ -24,8 +24,7 @@
 /* Structure used to connect() the socket to a particular tunnel UDP
  * socket.
  */
-struct pppol2tp_addr
-{
+struct pppol2tp_addr {
 	__kernel_pid_t	pid;		/* pid that owns the fd.
 					 * 0 => current */
 	int	fd;			/* FD of UDP socket to use */
diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h
index 8d76cb4c86fa..1822d635be6b 100644
--- a/include/linux/if_tunnel.h
+++ b/include/linux/if_tunnel.h
@@ -30,8 +30,7 @@
 #define GRE_FLAGS	__cpu_to_be16(0x00F8)
 #define GRE_VERSION	__cpu_to_be16(0x0007)
 
-struct ip_tunnel_parm
-{
+struct ip_tunnel_parm {
 	char			name[IFNAMSIZ];
 	int			link;
 	__be16			i_flags;
@@ -63,8 +62,7 @@ struct ip_tunnel_6rd {
 	__u16			relay_prefixlen;
 };
 
-enum
-{
+enum {
 	IFLA_GRE_UNSPEC,
 	IFLA_GRE_LINK,
 	IFLA_GRE_IFLAGS,
diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index fe158e0e20e6..724c27e5d173 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -27,8 +27,7 @@
  *	Header in on cable format
  */
 
-struct igmphdr
-{
+struct igmphdr {
 	__u8 type;
 	__u8 code;		/* For newer IGMP */
 	__sum16 csum;
@@ -151,8 +150,7 @@ static inline struct igmpv3_query *
 extern int sysctl_igmp_max_memberships;
 extern int sysctl_igmp_max_msf;
 
-struct ip_sf_socklist
-{
+struct ip_sf_socklist {
 	unsigned int		sl_max;
 	unsigned int		sl_count;
 	__be32			sl_addr[0];
@@ -167,16 +165,14 @@ struct ip_sf_socklist
    this list never used in fast path code
  */
 
-struct ip_mc_socklist
-{
+struct ip_mc_socklist {
 	struct ip_mc_socklist	*next;
 	struct ip_mreqn		multi;
 	unsigned int		sfmode;		/* MCAST_{INCLUDE,EXCLUDE} */
 	struct ip_sf_socklist	*sflist;
 };
 
-struct ip_sf_list
-{
+struct ip_sf_list {
 	struct ip_sf_list	*sf_next;
 	__be32			sf_inaddr;
 	unsigned long		sf_count[2];	/* include/exclude counts */
@@ -185,8 +181,7 @@ struct ip_sf_list
 	unsigned char		sf_crcount;	/* retrans. left to send */
 };
 
-struct ip_mc_list
-{
+struct ip_mc_list {
 	struct in_device	*interface;
 	__be32			multiaddr;
 	struct ip_sf_list	*sources;
diff --git a/include/linux/in.h b/include/linux/in.h
index cf196da04ec9..b615649db129 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -118,14 +118,12 @@ struct in_addr {
 
 /* Request struct for multicast socket ops */
 
-struct ip_mreq 
-{
+struct ip_mreq  {
 	struct in_addr imr_multiaddr;	/* IP multicast address of group */
 	struct in_addr imr_interface;	/* local IP address of interface */
 };
 
-struct ip_mreqn
-{
+struct ip_mreqn {
 	struct in_addr	imr_multiaddr;		/* IP multicast address of group */
 	struct in_addr	imr_address;		/* local IP address of interface */
 	int		imr_ifindex;		/* Interface index */
@@ -149,21 +147,18 @@ struct ip_msfilter {
 	(sizeof(struct ip_msfilter) - sizeof(__u32) \
 	+ (numsrc) * sizeof(__u32))
 
-struct group_req
-{
+struct group_req {
 	__u32				 gr_interface;	/* interface index */
 	struct __kernel_sockaddr_storage gr_group;	/* group address */
 };
 
-struct group_source_req
-{
+struct group_source_req {
 	__u32				 gsr_interface;	/* interface index */
 	struct __kernel_sockaddr_storage gsr_group;	/* group address */
 	struct __kernel_sockaddr_storage gsr_source;	/* source address */
 };
 
-struct group_filter
-{
+struct group_filter {
 	__u32				 gf_interface;	/* interface index */
 	struct __kernel_sockaddr_storage gf_group;	/* multicast address */
 	__u32				 gf_fmode;	/* filter mode */
@@ -175,8 +170,7 @@ struct group_filter
 	(sizeof(struct group_filter) - sizeof(struct __kernel_sockaddr_storage) \
 	+ (numsrc) * sizeof(struct __kernel_sockaddr_storage))
 
-struct in_pktinfo
-{
+struct in_pktinfo {
 	int		ipi_ifindex;
 	struct in_addr	ipi_spec_dst;
 	struct in_addr	ipi_addr;
diff --git a/include/linux/in6.h b/include/linux/in6.h
index 718bf21c5754..dfa29168e6ab 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -27,10 +27,8 @@
  *	IPv6 address structure
  */
 
-struct in6_addr
-{
-	union 
-	{
+struct in6_addr {
+	union {
 		__u8		u6_addr8[16];
 		__be16		u6_addr16[8];
 		__be32		u6_addr32[4];
@@ -75,8 +73,7 @@ struct ipv6_mreq {
 
 #define ipv6mr_acaddr	ipv6mr_multiaddr
 
-struct in6_flowlabel_req
-{
+struct in6_flowlabel_req {
 	struct in6_addr	flr_dst;
 	__be32	flr_label;
 	__u8	flr_action;
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index ad27c7da8798..eecfa559bfb4 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -10,15 +10,13 @@
 #include <linux/timer.h>
 #include <linux/sysctl.h>
 
-struct ipv4_devconf
-{
+struct ipv4_devconf {
 	void	*sysctl;
 	int	data[__NET_IPV4_CONF_MAX - 1];
 	DECLARE_BITMAP(state, __NET_IPV4_CONF_MAX - 1);
 };
 
-struct in_device
-{
+struct in_device {
 	struct net_device	*dev;
 	atomic_t		refcnt;
 	int			dead;
@@ -110,8 +108,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 #define IN_DEV_ARP_IGNORE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_IGNORE)
 #define IN_DEV_ARP_NOTIFY(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)
 
-struct in_ifaddr
-{
+struct in_ifaddr {
 	struct in_ifaddr	*ifa_next;
 	struct in_device	*ifa_dev;
 	struct rcu_head		rcu_head;
diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h
index 148265e63e8d..dfc170362842 100644
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -127,8 +127,7 @@ struct ip_vs_dest_user {
 /*
  *	IPVS statistics object (for user space)
  */
-struct ip_vs_stats_user
-{
+struct ip_vs_stats_user {
 	__u32                   conns;          /* connections scheduled */
 	__u32                   inpkts;         /* incoming packets */
 	__u32                   outpkts;        /* outgoing packets */
diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index d5f69151f692..c5f3d53548e2 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -76,8 +76,7 @@ struct vifctl {
  *	Cache manipulation structures for mrouted and PIMd
  */
  
-struct mfcctl
-{
+struct mfcctl {
 	struct in_addr mfcc_origin;		/* Origin of mcast	*/
 	struct in_addr mfcc_mcastgrp;		/* Group in question	*/
 	vifi_t	mfcc_parent;			/* Where it arrived	*/
@@ -92,8 +91,7 @@ struct mfcctl
  *	Group count retrieval for mrouted
  */
  
-struct sioc_sg_req
-{
+struct sioc_sg_req {
 	struct in_addr src;
 	struct in_addr grp;
 	unsigned long pktcnt;
@@ -105,8 +103,7 @@ struct sioc_sg_req
  *	To get vif packet counts
  */
 
-struct sioc_vif_req
-{
+struct sioc_vif_req {
 	vifi_t	vifi;		/* Which iface */
 	unsigned long icount;	/* In packets */
 	unsigned long ocount;	/* Out packets */
@@ -119,8 +116,7 @@ struct sioc_vif_req
  *	data. Magically happens to be like an IP packet as per the original
  */
  
-struct igmpmsg
-{
+struct igmpmsg {
 	__u32 unused1,unused2;
 	unsigned char im_msgtype;		/* What is this */
 	unsigned char im_mbz;			/* Must be zero */
@@ -181,8 +177,7 @@ static inline int ip_mr_init(void)
 }
 #endif
 
-struct vif_device
-{
+struct vif_device {
 	struct net_device 	*dev;			/* Device we are using */
 	unsigned long	bytes_in,bytes_out;
 	unsigned long	pkt_in,pkt_out;		/* Statistics 			*/
@@ -195,8 +190,7 @@ struct vif_device
 
 #define VIFF_STATIC 0x8000
 
-struct mfc_cache 
-{
+struct mfc_cache {
 	struct mfc_cache *next;			/* Next entry on cache line 	*/
 #ifdef CONFIG_NET_NS
 	struct net *mfc_net;
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index b191865a6ca3..2caa1a8e525d 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -75,8 +75,7 @@ struct mif6ctl {
  *	Cache manipulation structures for mrouted and PIMd
  */
 
-struct mf6cctl
-{
+struct mf6cctl {
 	struct sockaddr_in6 mf6cc_origin;		/* Origin of mcast	*/
 	struct sockaddr_in6 mf6cc_mcastgrp;		/* Group in question	*/
 	mifi_t	mf6cc_parent;			/* Where it arrived	*/
@@ -87,8 +86,7 @@ struct mf6cctl
  *	Group count retrieval for pim6sd
  */
 
-struct sioc_sg_req6
-{
+struct sioc_sg_req6 {
 	struct sockaddr_in6 src;
 	struct sockaddr_in6 grp;
 	unsigned long pktcnt;
@@ -100,8 +98,7 @@ struct sioc_sg_req6
  *	To get vif packet counts
  */
 
-struct sioc_mif_req6
-{
+struct sioc_mif_req6 {
 	mifi_t	mifi;		/* Which iface */
 	unsigned long icount;	/* In packets */
 	unsigned long ocount;	/* Out packets */
@@ -172,8 +169,7 @@ static inline void ip6_mr_cleanup(void)
 }
 #endif
 
-struct mif_device
-{
+struct mif_device {
 	struct net_device 	*dev;			/* Device we are using */
 	unsigned long	bytes_in,bytes_out;
 	unsigned long	pkt_in,pkt_out;		/* Statistics 			*/
@@ -185,8 +181,7 @@ struct mif_device
 
 #define VIFF_STATIC 0x8000
 
-struct mfc6_cache
-{
+struct mfc6_cache {
 	struct mfc6_cache *next;		/* Next entry on cache line 	*/
 #ifdef CONFIG_NET_NS
 	struct net *mfc6_net;
diff --git a/include/linux/neighbour.h b/include/linux/neighbour.h
index 12c9de138451..a7003b7a695d 100644
--- a/include/linux/neighbour.h
+++ b/include/linux/neighbour.h
@@ -4,8 +4,7 @@
 #include <linux/types.h>
 #include <linux/netlink.h>
 
-struct ndmsg
-{
+struct ndmsg {
 	__u8		ndm_family;
 	__u8		ndm_pad1;
 	__u16		ndm_pad2;
@@ -15,8 +14,7 @@ struct ndmsg
 	__u8		ndm_type;
 };
 
-enum
-{
+enum {
 	NDA_UNSPEC,
 	NDA_DST,
 	NDA_LLADDR,
@@ -56,8 +54,7 @@ enum
    NUD_PERMANENT is also cannot be deleted by garbage collectors.
  */
 
-struct nda_cacheinfo
-{
+struct nda_cacheinfo {
 	__u32		ndm_confirmed;
 	__u32		ndm_used;
 	__u32		ndm_updated;
@@ -89,8 +86,7 @@ struct nda_cacheinfo
  * device.
  ****/
 
-struct ndt_stats
-{
+struct ndt_stats {
 	__u64		ndts_allocs;
 	__u64		ndts_destroys;
 	__u64		ndts_hash_grows;
@@ -124,15 +120,13 @@ enum {
 };
 #define NDTPA_MAX (__NDTPA_MAX - 1)
 
-struct ndtmsg
-{
+struct ndtmsg {
 	__u8		ndtm_family;
 	__u8		ndtm_pad1;
 	__u16		ndtm_pad2;
 };
 
-struct ndt_config
-{
+struct ndt_config {
 	__u16		ndtc_key_len;
 	__u16		ndtc_entry_size;
 	__u32		ndtc_entries;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5077de028317..465add6c43e3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -125,8 +125,7 @@ typedef enum netdev_tx netdev_tx_t;
  *	with byte counters.
  */
 
-struct net_device_stats
-{
+struct net_device_stats {
 	unsigned long	rx_packets;		/* total packets received	*/
 	unsigned long	tx_packets;		/* total packets transmitted	*/
 	unsigned long	rx_bytes;		/* total bytes received 	*/
@@ -179,8 +178,7 @@ struct neighbour;
 struct neigh_parms;
 struct sk_buff;
 
-struct netif_rx_stats
-{
+struct netif_rx_stats {
 	unsigned total;
 	unsigned dropped;
 	unsigned time_squeeze;
@@ -189,8 +187,7 @@ struct netif_rx_stats
 
 DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
 
-struct dev_addr_list
-{
+struct dev_addr_list {
 	struct dev_addr_list	*next;
 	u8			da_addr[MAX_ADDR_LEN];
 	u8			da_addrlen;
@@ -227,8 +224,7 @@ struct netdev_hw_addr_list {
 	int			count;
 };
 
-struct hh_cache
-{
+struct hh_cache {
 	struct hh_cache *hh_next;	/* Next entry			     */
 	atomic_t	hh_refcnt;	/* number of users                   */
 /*
@@ -291,8 +287,7 @@ struct header_ops {
  * code.
  */
 
-enum netdev_state_t
-{
+enum netdev_state_t {
 	__LINK_STATE_START,
 	__LINK_STATE_PRESENT,
 	__LINK_STATE_NOCARRIER,
@@ -341,8 +336,7 @@ struct napi_struct {
 	struct sk_buff		*skb;
 };
 
-enum
-{
+enum {
 	NAPI_STATE_SCHED,	/* Poll is scheduled */
 	NAPI_STATE_DISABLE,	/* Disable pending */
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
@@ -458,8 +452,7 @@ static inline void napi_synchronize(const struct napi_struct *n)
 # define napi_synchronize(n)	barrier()
 #endif
 
-enum netdev_queue_state_t
-{
+enum netdev_queue_state_t {
 	__QUEUE_STATE_XOFF,
 	__QUEUE_STATE_FROZEN,
 };
@@ -653,8 +646,7 @@ struct net_device_ops {
  *	moves out.
  */
 
-struct net_device
-{
+struct net_device {
 
 	/*
 	 * This is the first field of the "visible" part of this structure
@@ -1229,8 +1221,7 @@ static inline int unregister_gifconf(unsigned int family)
  * Incoming packets are placed on per-cpu queues so that
  * no locking is needed.
  */
-struct softnet_data
-{
+struct softnet_data {
 	struct Qdisc		*output_queue;
 	struct sk_buff_head	input_pkt_queue;
 	struct list_head	poll_list;
@@ -1627,7 +1618,8 @@ static inline int netif_dormant(const struct net_device *dev)
  *
  * Check if carrier is operational
  */
-static inline int netif_oper_up(const struct net_device *dev) {
+static inline int netif_oper_up(const struct net_device *dev)
+{
 	return (dev->operstate == IF_OPER_UP ||
 		dev->operstate == IF_OPER_UNKNOWN /* backward compat */);
 }
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 6132b5e6d9d3..48c54960773c 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -93,8 +93,7 @@ typedef unsigned int nf_hookfn(unsigned int hooknum,
 			       const struct net_device *out,
 			       int (*okfn)(struct sk_buff *));
 
-struct nf_hook_ops
-{
+struct nf_hook_ops {
 	struct list_head list;
 
 	/* User fills in from here down. */
@@ -106,8 +105,7 @@ struct nf_hook_ops
 	int priority;
 };
 
-struct nf_sockopt_ops
-{
+struct nf_sockopt_ops {
 	struct list_head list;
 
 	u_int8_t pf;
diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index a8248ee422b7..a374787ed9b0 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -3,8 +3,7 @@
 /* Connection state tracking for netfilter.  This is separated from,
    but required by, the NAT layer; it can also be used by an iptables
    extension. */
-enum ip_conntrack_info
-{
+enum ip_conntrack_info {
 	/* Part of an established connection (either direction). */
 	IP_CT_ESTABLISHED,
 
@@ -76,8 +75,7 @@ enum ip_conntrack_status {
 };
 
 #ifdef __KERNEL__
-struct ip_conntrack_stat
-{
+struct ip_conntrack_stat {
 	unsigned int searched;
 	unsigned int found;
 	unsigned int new;
diff --git a/include/linux/netfilter/nf_conntrack_ftp.h b/include/linux/netfilter/nf_conntrack_ftp.h
index 47727d7546ea..3e3aa08980c3 100644
--- a/include/linux/netfilter/nf_conntrack_ftp.h
+++ b/include/linux/netfilter/nf_conntrack_ftp.h
@@ -3,8 +3,7 @@
 /* FTP tracking. */
 
 /* This enum is exposed to userspace */
-enum nf_ct_ftp_type
-{
+enum nf_ct_ftp_type {
 	/* PORT command from client */
 	NF_CT_FTP_PORT,
 	/* PASV response from server */
diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h
index 768f78c4ac53..ceeefe6681b5 100644
--- a/include/linux/netfilter/nf_conntrack_sctp.h
+++ b/include/linux/netfilter/nf_conntrack_sctp.h
@@ -16,8 +16,7 @@ enum sctp_conntrack {
 	SCTP_CONNTRACK_MAX
 };
 
-struct ip_ct_sctp
-{
+struct ip_ct_sctp {
 	enum sctp_conntrack state;
 
 	__be32 vtag[IP_CT_DIR_MAX];
diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h
index 4352feed2377..f6d97f64d7a0 100644
--- a/include/linux/netfilter/nf_conntrack_tcp.h
+++ b/include/linux/netfilter/nf_conntrack_tcp.h
@@ -55,8 +55,7 @@ struct ip_ct_tcp_state {
 	u_int8_t	flags;		/* per direction options */
 };
 
-struct ip_ct_tcp
-{
+struct ip_ct_tcp {
 	struct ip_ct_tcp_state seen[2];	/* connection parameters per direction */
 	u_int8_t	state;		/* state of the connection (enum tcp_conntrack) */
 	/* For detecting stale connections */
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 9f00da287f2c..49d321f3ccd2 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -55,8 +55,7 @@ struct nfgenmsg {
 #include <linux/capability.h>
 #include <net/netlink.h>
 
-struct nfnl_callback
-{
+struct nfnl_callback {
 	int (*call)(struct sock *nl, struct sk_buff *skb, 
 		    const struct nlmsghdr *nlh,
 		    const struct nlattr * const cda[]);
@@ -64,8 +63,7 @@ struct nfnl_callback
 	const u_int16_t attr_count;		/* number of nlattr's */
 };
 
-struct nfnetlink_subsystem
-{
+struct nfnetlink_subsystem {
 	const char *name;
 	__u8 subsys_id;			/* nfnetlink subsystem ID */
 	__u8 cb_count;			/* number of callbacks */
diff --git a/include/linux/netfilter/nfnetlink_compat.h b/include/linux/netfilter/nfnetlink_compat.h
index eda55cabceec..ffb95036bbd4 100644
--- a/include/linux/netfilter/nfnetlink_compat.h
+++ b/include/linux/netfilter/nfnetlink_compat.h
@@ -21,8 +21,7 @@
  * ! nfnetlink use the same attributes methods. - J. Schulist.
  */
 
-struct nfattr
-{
+struct nfattr {
 	__u16 nfa_len;
 	__u16 nfa_type;	/* we use 15 bits for the type, and the highest
 				 * bit to indicate whether the payload is nested */
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 812cb153cabb..378f27ae7772 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -6,8 +6,7 @@
 #define XT_FUNCTION_MAXNAMELEN 30
 #define XT_TABLE_MAXNAMELEN 32
 
-struct xt_entry_match
-{
+struct xt_entry_match {
 	union {
 		struct {
 			__u16 match_size;
@@ -31,8 +30,7 @@ struct xt_entry_match
 	unsigned char data[0];
 };
 
-struct xt_entry_target
-{
+struct xt_entry_target {
 	union {
 		struct {
 			__u16 target_size;
@@ -64,16 +62,14 @@ struct xt_entry_target
 	},								       \
 }
 
-struct xt_standard_target
-{
+struct xt_standard_target {
 	struct xt_entry_target target;
 	int verdict;
 };
 
 /* The argument to IPT_SO_GET_REVISION_*.  Returns highest revision
  * kernel supports, if >= revision. */
-struct xt_get_revision
-{
+struct xt_get_revision {
 	char name[XT_FUNCTION_MAXNAMELEN-1];
 
 	__u8 revision;
@@ -90,8 +86,7 @@ struct xt_get_revision
  * ip6t_entry and arpt_entry.  This sucks, and it is a hack.  It will be my
  * personal pleasure to remove it -HW
  */
-struct _xt_align
-{
+struct _xt_align {
 	__u8 u8;
 	__u16 u16;
 	__u32 u32;
@@ -109,14 +104,12 @@ struct _xt_align
 #define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
 #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
 
-struct xt_counters
-{
+struct xt_counters {
 	__u64 pcnt, bcnt;			/* Packet and byte counters */
 };
 
 /* The argument to IPT_SO_ADD_COUNTERS. */
-struct xt_counters_info
-{
+struct xt_counters_info {
 	/* Which table. */
 	char name[XT_TABLE_MAXNAMELEN];
 
@@ -269,8 +262,7 @@ struct xt_tgdtor_param {
 	u_int8_t family;
 };
 
-struct xt_match
-{
+struct xt_match {
 	struct list_head list;
 
 	const char name[XT_FUNCTION_MAXNAMELEN-1];
@@ -310,8 +302,7 @@ struct xt_match
 };
 
 /* Registration hooks for targets. */
-struct xt_target
-{
+struct xt_target {
 	struct list_head list;
 
 	const char name[XT_FUNCTION_MAXNAMELEN-1];
@@ -349,8 +340,7 @@ struct xt_target
 };
 
 /* Furniture shopping... */
-struct xt_table
-{
+struct xt_table {
 	struct list_head list;
 
 	/* What hooks you will enter on */
@@ -371,8 +361,7 @@ struct xt_table
 #include <linux/netfilter_ipv4.h>
 
 /* The table itself */
-struct xt_table_info
-{
+struct xt_table_info {
 	/* Size per table */
 	unsigned int size;
 	/* Number of entries: FIXME. --RR */
@@ -528,8 +517,7 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
-struct compat_xt_entry_match
-{
+struct compat_xt_entry_match {
 	union {
 		struct {
 			u_int16_t match_size;
@@ -545,8 +533,7 @@ struct compat_xt_entry_match
 	unsigned char data[0];
 };
 
-struct compat_xt_entry_target
-{
+struct compat_xt_entry_target {
 	union {
 		struct {
 			u_int16_t target_size;
@@ -566,8 +553,7 @@ struct compat_xt_entry_target
  * need to change whole approach in order to calculate align as function of
  * current task alignment */
 
-struct compat_xt_counters
-{
+struct compat_xt_counters {
 #if defined(CONFIG_X86_64) || defined(CONFIG_IA64)
 	u_int32_t cnt[4];
 #else
@@ -575,8 +561,7 @@ struct compat_xt_counters
 #endif
 };
 
-struct compat_xt_counters_info
-{
+struct compat_xt_counters_info {
 	char name[XT_TABLE_MAXNAMELEN];
 	compat_uint_t num_counters;
 	struct compat_xt_counters counters[0];
diff --git a/include/linux/netfilter/xt_connbytes.h b/include/linux/netfilter/xt_connbytes.h
index 52bd6153b996..92fcbb0d193e 100644
--- a/include/linux/netfilter/xt_connbytes.h
+++ b/include/linux/netfilter/xt_connbytes.h
@@ -15,8 +15,7 @@ enum xt_connbytes_direction {
 	XT_CONNBYTES_DIR_BOTH,
 };
 
-struct xt_connbytes_info
-{
+struct xt_connbytes_info {
 	struct {
 		aligned_u64 from;	/* count to be matched */
 		aligned_u64 to;		/* count to be matched */
diff --git a/include/linux/netfilter/xt_esp.h b/include/linux/netfilter/xt_esp.h
index ef6fa4747d0a..ee6882408000 100644
--- a/include/linux/netfilter/xt_esp.h
+++ b/include/linux/netfilter/xt_esp.h
@@ -3,8 +3,7 @@
 
 #include <linux/types.h>
 
-struct xt_esp
-{
+struct xt_esp {
 	__u32 spis[2];	/* Security Parameter Index */
 	__u8  invflags;	/* Inverse flags */
 };
diff --git a/include/linux/netfilter/xt_multiport.h b/include/linux/netfilter/xt_multiport.h
index 185db499fcbc..5b7e72dfffc5 100644
--- a/include/linux/netfilter/xt_multiport.h
+++ b/include/linux/netfilter/xt_multiport.h
@@ -3,8 +3,7 @@
 
 #include <linux/types.h>
 
-enum xt_multiport_flags
-{
+enum xt_multiport_flags {
 	XT_MULTIPORT_SOURCE,
 	XT_MULTIPORT_DESTINATION,
 	XT_MULTIPORT_EITHER
@@ -13,15 +12,13 @@ enum xt_multiport_flags
 #define XT_MULTI_PORTS	15
 
 /* Must fit inside union xt_matchinfo: 16 bytes */
-struct xt_multiport
-{
+struct xt_multiport {
 	__u8 flags;				/* Type of comparison */
 	__u8 count;				/* Number of ports */
 	__u16 ports[XT_MULTI_PORTS];	/* Ports */
 };
 
-struct xt_multiport_v1
-{
+struct xt_multiport_v1 {
 	__u8 flags;				/* Type of comparison */
 	__u8 count;				/* Number of ports */
 	__u16 ports[XT_MULTI_PORTS];	/* Ports */
diff --git a/include/linux/netfilter/xt_policy.h b/include/linux/netfilter/xt_policy.h
index 7bb64e7c853d..be8ead05c316 100644
--- a/include/linux/netfilter/xt_policy.h
+++ b/include/linux/netfilter/xt_policy.h
@@ -5,22 +5,19 @@
 
 #define XT_POLICY_MAX_ELEM	4
 
-enum xt_policy_flags
-{
+enum xt_policy_flags {
 	XT_POLICY_MATCH_IN	= 0x1,
 	XT_POLICY_MATCH_OUT	= 0x2,
 	XT_POLICY_MATCH_NONE	= 0x4,
 	XT_POLICY_MATCH_STRICT	= 0x8,
 };
 
-enum xt_policy_modes
-{
+enum xt_policy_modes {
 	XT_POLICY_MODE_TRANSPORT,
 	XT_POLICY_MODE_TUNNEL
 };
 
-struct xt_policy_spec
-{
+struct xt_policy_spec {
 	__u8	saddr:1,
 			daddr:1,
 			proto:1,
@@ -30,15 +27,13 @@ struct xt_policy_spec
 };
 
 #ifndef __KERNEL__
-union xt_policy_addr
-{
+union xt_policy_addr {
 	struct in_addr	a4;
 	struct in6_addr	a6;
 };
 #endif
 
-struct xt_policy_elem
-{
+struct xt_policy_elem {
 	union {
 #ifdef __KERNEL__
 		struct {
@@ -65,8 +60,7 @@ struct xt_policy_elem
 	struct xt_policy_spec	invert;
 };
 
-struct xt_policy_info
-{
+struct xt_policy_info {
 	struct xt_policy_elem pol[XT_POLICY_MAX_ELEM];
 	__u16 flags;
 	__u16 len;
diff --git a/include/linux/netfilter/xt_state.h b/include/linux/netfilter/xt_state.h
index c06f32edee07..7b32de886613 100644
--- a/include/linux/netfilter/xt_state.h
+++ b/include/linux/netfilter/xt_state.h
@@ -6,8 +6,7 @@
 
 #define XT_STATE_UNTRACKED (1 << (IP_CT_NUMBER + 1))
 
-struct xt_state_info
-{
+struct xt_state_info {
 	unsigned int statemask;
 };
 #endif /*_XT_STATE_H*/
diff --git a/include/linux/netfilter/xt_string.h b/include/linux/netfilter/xt_string.h
index ecbb95fc89ed..235347c02eab 100644
--- a/include/linux/netfilter/xt_string.h
+++ b/include/linux/netfilter/xt_string.h
@@ -11,8 +11,7 @@ enum {
 	XT_STRING_FLAG_IGNORECASE	= 0x02
 };
 
-struct xt_string_info
-{
+struct xt_string_info {
 	__u16 from_offset;
 	__u16 to_offset;
 	char	  algo[XT_STRING_MAX_ALGO_NAME_SIZE];
diff --git a/include/linux/netfilter/xt_tcpudp.h b/include/linux/netfilter/xt_tcpudp.h
index a490a0bc1d29..38aa7b399021 100644
--- a/include/linux/netfilter/xt_tcpudp.h
+++ b/include/linux/netfilter/xt_tcpudp.h
@@ -4,8 +4,7 @@
 #include <linux/types.h>
 
 /* TCP matching stuff */
-struct xt_tcp
-{
+struct xt_tcp {
 	__u16 spts[2];			/* Source port range. */
 	__u16 dpts[2];			/* Destination port range. */
 	__u8 option;			/* TCP Option iff non-zero*/
@@ -22,8 +21,7 @@ struct xt_tcp
 #define XT_TCP_INV_MASK		0x0F	/* All possible flags. */
 
 /* UDP matching stuff */
-struct xt_udp
-{
+struct xt_udp {
 	__u16 spts[2];			/* Source port range. */
 	__u16 dpts[2];			/* Destination port range. */
 	__u8 invflags;			/* Inverse flags */
diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 6fe3e6aa10db..f2336523a9df 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -132,8 +132,7 @@ struct arpt_entry
 #define ARPT_RETURN XT_RETURN
 
 /* The argument to ARPT_SO_GET_INFO */
-struct arpt_getinfo
-{
+struct arpt_getinfo {
 	/* Which table: caller fills this in. */
 	char name[ARPT_TABLE_MAXNAMELEN];
 
@@ -155,8 +154,7 @@ struct arpt_getinfo
 };
 
 /* The argument to ARPT_SO_SET_REPLACE. */
-struct arpt_replace
-{
+struct arpt_replace {
 	/* Which table. */
 	char name[ARPT_TABLE_MAXNAMELEN];
 
@@ -191,8 +189,7 @@ struct arpt_replace
 #define arpt_counters xt_counters
 
 /* The argument to ARPT_SO_GET_ENTRIES. */
-struct arpt_get_entries
-{
+struct arpt_get_entries {
 	/* Which table: user fills this in. */
 	char name[ARPT_TABLE_MAXNAMELEN];
 
@@ -224,20 +221,17 @@ static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e
 #ifdef __KERNEL__
 
 /* Standard entry. */
-struct arpt_standard
-{
+struct arpt_standard {
 	struct arpt_entry entry;
 	struct arpt_standard_target target;
 };
 
-struct arpt_error_target
-{
+struct arpt_error_target {
 	struct arpt_entry_target target;
 	char errorname[ARPT_FUNCTION_MAXNAMELEN];
 };
 
-struct arpt_error
-{
+struct arpt_error {
 	struct arpt_entry entry;
 	struct arpt_error_target target;
 };
@@ -279,8 +273,7 @@ extern unsigned int arpt_do_table(struct sk_buff *skb,
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
-struct compat_arpt_entry
-{
+struct compat_arpt_entry {
 	struct arpt_arp arp;
 	u_int16_t target_offset;
 	u_int16_t next_offset;
diff --git a/include/linux/netfilter_bridge/ebt_802_3.h b/include/linux/netfilter_bridge/ebt_802_3.h
index a11b0c2017fd..c73ef0b18bdc 100644
--- a/include/linux/netfilter_bridge/ebt_802_3.h
+++ b/include/linux/netfilter_bridge/ebt_802_3.h
@@ -58,8 +58,7 @@ static inline struct ebt_802_3_hdr *ebt_802_3_hdr(const struct sk_buff *skb)
 }
 #endif
 
-struct ebt_802_3_info 
-{
+struct ebt_802_3_info {
 	uint8_t  sap;
 	__be16 type;
 	uint8_t  bitmask;
diff --git a/include/linux/netfilter_bridge/ebt_among.h b/include/linux/netfilter_bridge/ebt_among.h
index 7654069233ca..0009558609a7 100644
--- a/include/linux/netfilter_bridge/ebt_among.h
+++ b/include/linux/netfilter_bridge/ebt_among.h
@@ -29,14 +29,12 @@
  * Yes, it is a memory overhead, but in 2003 AD, who cares?
  */
 
-struct ebt_mac_wormhash_tuple
-{
+struct ebt_mac_wormhash_tuple {
 	uint32_t cmp[2];
 	__be32 ip;
 };
 
-struct ebt_mac_wormhash
-{
+struct ebt_mac_wormhash {
 	int table[257];
 	int poolsize;
 	struct ebt_mac_wormhash_tuple pool[0];
@@ -45,8 +43,7 @@ struct ebt_mac_wormhash
 #define ebt_mac_wormhash_size(x) ((x) ? sizeof(struct ebt_mac_wormhash) \
 		+ (x)->poolsize * sizeof(struct ebt_mac_wormhash_tuple) : 0)
 
-struct ebt_among_info
-{
+struct ebt_among_info {
 	int wh_dst_ofs;
 	int wh_src_ofs;
 	int bitmask;
diff --git a/include/linux/netfilter_bridge/ebt_arpreply.h b/include/linux/netfilter_bridge/ebt_arpreply.h
index 96a8339960e0..7e77896e1fbf 100644
--- a/include/linux/netfilter_bridge/ebt_arpreply.h
+++ b/include/linux/netfilter_bridge/ebt_arpreply.h
@@ -1,8 +1,7 @@
 #ifndef __LINUX_BRIDGE_EBT_ARPREPLY_H
 #define __LINUX_BRIDGE_EBT_ARPREPLY_H
 
-struct ebt_arpreply_info
-{
+struct ebt_arpreply_info {
 	unsigned char mac[ETH_ALEN];
 	int target;
 };
diff --git a/include/linux/netfilter_bridge/ebt_ip.h b/include/linux/netfilter_bridge/ebt_ip.h
index d6847475bf2e..6a708fb92241 100644
--- a/include/linux/netfilter_bridge/ebt_ip.h
+++ b/include/linux/netfilter_bridge/ebt_ip.h
@@ -26,8 +26,7 @@
 #define EBT_IP_MATCH "ip"
 
 /* the same values are used for the invflags */
-struct ebt_ip_info
-{
+struct ebt_ip_info {
 	__be32 saddr;
 	__be32 daddr;
 	__be32 smsk;
diff --git a/include/linux/netfilter_bridge/ebt_ip6.h b/include/linux/netfilter_bridge/ebt_ip6.h
index 2273c3ae33ca..e5de98701519 100644
--- a/include/linux/netfilter_bridge/ebt_ip6.h
+++ b/include/linux/netfilter_bridge/ebt_ip6.h
@@ -23,8 +23,7 @@
 #define EBT_IP6_MATCH "ip6"
 
 /* the same values are used for the invflags */
-struct ebt_ip6_info
-{
+struct ebt_ip6_info {
 	struct in6_addr saddr;
 	struct in6_addr daddr;
 	struct in6_addr smsk;
diff --git a/include/linux/netfilter_bridge/ebt_limit.h b/include/linux/netfilter_bridge/ebt_limit.h
index d8b65000afe4..4bf76b751676 100644
--- a/include/linux/netfilter_bridge/ebt_limit.h
+++ b/include/linux/netfilter_bridge/ebt_limit.h
@@ -9,8 +9,7 @@
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
    seconds, or one every 59 hours. */
 
-struct ebt_limit_info
-{
+struct ebt_limit_info {
 	u_int32_t avg;    /* Average secs between packets * scale */
 	u_int32_t burst;  /* Period multiplier for upper limit. */
 
diff --git a/include/linux/netfilter_bridge/ebt_log.h b/include/linux/netfilter_bridge/ebt_log.h
index b76e653157e5..cc2cdfb764bc 100644
--- a/include/linux/netfilter_bridge/ebt_log.h
+++ b/include/linux/netfilter_bridge/ebt_log.h
@@ -9,8 +9,7 @@
 #define EBT_LOG_PREFIX_SIZE 30
 #define EBT_LOG_WATCHER "log"
 
-struct ebt_log_info
-{
+struct ebt_log_info {
 	uint8_t loglevel;
 	uint8_t prefix[EBT_LOG_PREFIX_SIZE];
 	uint32_t bitmask;
diff --git a/include/linux/netfilter_bridge/ebt_mark_m.h b/include/linux/netfilter_bridge/ebt_mark_m.h
index 301524ff1065..9ceb10ec0ed6 100644
--- a/include/linux/netfilter_bridge/ebt_mark_m.h
+++ b/include/linux/netfilter_bridge/ebt_mark_m.h
@@ -4,8 +4,7 @@
 #define EBT_MARK_AND 0x01
 #define EBT_MARK_OR 0x02
 #define EBT_MARK_MASK (EBT_MARK_AND | EBT_MARK_OR)
-struct ebt_mark_m_info
-{
+struct ebt_mark_m_info {
 	unsigned long mark, mask;
 	uint8_t invert;
 	uint8_t bitmask;
diff --git a/include/linux/netfilter_bridge/ebt_mark_t.h b/include/linux/netfilter_bridge/ebt_mark_t.h
index 6270f6f33693..7d5a268a4311 100644
--- a/include/linux/netfilter_bridge/ebt_mark_t.h
+++ b/include/linux/netfilter_bridge/ebt_mark_t.h
@@ -13,8 +13,7 @@
 #define MARK_AND_VALUE (0xffffffd0)
 #define MARK_XOR_VALUE (0xffffffc0)
 
-struct ebt_mark_t_info
-{
+struct ebt_mark_t_info {
 	unsigned long mark;
 	/* EBT_ACCEPT, EBT_DROP, EBT_CONTINUE or EBT_RETURN */
 	int target;
diff --git a/include/linux/netfilter_bridge/ebt_nat.h b/include/linux/netfilter_bridge/ebt_nat.h
index 435b886a51aa..5e74e3b03bd6 100644
--- a/include/linux/netfilter_bridge/ebt_nat.h
+++ b/include/linux/netfilter_bridge/ebt_nat.h
@@ -2,8 +2,7 @@
 #define __LINUX_BRIDGE_EBT_NAT_H
 
 #define NAT_ARP_BIT  (0x00000010)
-struct ebt_nat_info
-{
+struct ebt_nat_info {
 	unsigned char mac[ETH_ALEN];
 	/* EBT_ACCEPT, EBT_DROP, EBT_CONTINUE or EBT_RETURN */
 	int target;
diff --git a/include/linux/netfilter_bridge/ebt_pkttype.h b/include/linux/netfilter_bridge/ebt_pkttype.h
index 0d64bbb29c66..51a799840931 100644
--- a/include/linux/netfilter_bridge/ebt_pkttype.h
+++ b/include/linux/netfilter_bridge/ebt_pkttype.h
@@ -1,8 +1,7 @@
 #ifndef __LINUX_BRIDGE_EBT_PKTTYPE_H
 #define __LINUX_BRIDGE_EBT_PKTTYPE_H
 
-struct ebt_pkttype_info
-{
+struct ebt_pkttype_info {
 	uint8_t pkt_type;
 	uint8_t invert;
 };
diff --git a/include/linux/netfilter_bridge/ebt_redirect.h b/include/linux/netfilter_bridge/ebt_redirect.h
index 5c67990fce39..dd9622ce8488 100644
--- a/include/linux/netfilter_bridge/ebt_redirect.h
+++ b/include/linux/netfilter_bridge/ebt_redirect.h
@@ -1,8 +1,7 @@
 #ifndef __LINUX_BRIDGE_EBT_REDIRECT_H
 #define __LINUX_BRIDGE_EBT_REDIRECT_H
 
-struct ebt_redirect_info
-{
+struct ebt_redirect_info {
 	/* EBT_ACCEPT, EBT_DROP, EBT_CONTINUE or EBT_RETURN */
 	int target;
 };
diff --git a/include/linux/netfilter_bridge/ebt_stp.h b/include/linux/netfilter_bridge/ebt_stp.h
index e5fd67850f4d..e503a0aa2728 100644
--- a/include/linux/netfilter_bridge/ebt_stp.h
+++ b/include/linux/netfilter_bridge/ebt_stp.h
@@ -20,8 +20,7 @@
 
 #define EBT_STP_MATCH "stp"
 
-struct ebt_stp_config_info
-{
+struct ebt_stp_config_info {
 	uint8_t flags;
 	uint16_t root_priol, root_priou;
 	char root_addr[6], root_addrmsk[6];
@@ -35,8 +34,7 @@ struct ebt_stp_config_info
 	uint16_t forward_delayl, forward_delayu;
 };
 
-struct ebt_stp_info
-{
+struct ebt_stp_info {
 	uint8_t type;
 	struct ebt_stp_config_info config;
 	uint16_t bitmask;
diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index ea281e6a2048..3cc40c131cc3 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -34,14 +34,12 @@
 struct xt_match;
 struct xt_target;
 
-struct ebt_counter
-{
+struct ebt_counter {
 	uint64_t pcnt;
 	uint64_t bcnt;
 };
 
-struct ebt_replace
-{
+struct ebt_replace {
 	char name[EBT_TABLE_MAXNAMELEN];
 	unsigned int valid_hooks;
 	/* nr of rules in the table */
@@ -57,8 +55,7 @@ struct ebt_replace
 	char __user *entries;
 };
 
-struct ebt_replace_kernel
-{
+struct ebt_replace_kernel {
 	char name[EBT_TABLE_MAXNAMELEN];
 	unsigned int valid_hooks;
 	/* nr of rules in the table */
@@ -120,8 +117,7 @@ struct ebt_entries {
 #define EBT_INV_MASK (EBT_IPROTO | EBT_IIN | EBT_IOUT | EBT_ILOGICALIN \
    | EBT_ILOGICALOUT | EBT_ISOURCE | EBT_IDEST)
 
-struct ebt_entry_match
-{
+struct ebt_entry_match {
 	union {
 		char name[EBT_FUNCTION_MAXNAMELEN];
 		struct xt_match *match;
@@ -131,8 +127,7 @@ struct ebt_entry_match
 	unsigned char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace))));
 };
 
-struct ebt_entry_watcher
-{
+struct ebt_entry_watcher {
 	union {
 		char name[EBT_FUNCTION_MAXNAMELEN];
 		struct xt_target *watcher;
@@ -142,8 +137,7 @@ struct ebt_entry_watcher
 	unsigned char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace))));
 };
 
-struct ebt_entry_target
-{
+struct ebt_entry_target {
 	union {
 		char name[EBT_FUNCTION_MAXNAMELEN];
 		struct xt_target *target;
@@ -154,8 +148,7 @@ struct ebt_entry_target
 };
 
 #define EBT_STANDARD_TARGET "standard"
-struct ebt_standard_target
-{
+struct ebt_standard_target {
 	struct ebt_entry_target target;
 	int verdict;
 };
@@ -206,8 +199,7 @@ struct ebt_entry {
 #define EBT_MATCH 0
 #define EBT_NOMATCH 1
 
-struct ebt_match
-{
+struct ebt_match {
 	struct list_head list;
 	const char name[EBT_FUNCTION_MAXNAMELEN];
 	bool (*match)(const struct sk_buff *skb, const struct net_device *in,
@@ -224,8 +216,7 @@ struct ebt_match
 	struct module *me;
 };
 
-struct ebt_watcher
-{
+struct ebt_watcher {
 	struct list_head list;
 	const char name[EBT_FUNCTION_MAXNAMELEN];
 	unsigned int (*target)(struct sk_buff *skb,
@@ -242,8 +233,7 @@ struct ebt_watcher
 	struct module *me;
 };
 
-struct ebt_target
-{
+struct ebt_target {
 	struct list_head list;
 	const char name[EBT_FUNCTION_MAXNAMELEN];
 	/* returns one of the standard EBT_* verdicts */
@@ -262,15 +252,13 @@ struct ebt_target
 };
 
 /* used for jumping from and into user defined chains (udc) */
-struct ebt_chainstack
-{
+struct ebt_chainstack {
 	struct ebt_entries *chaininfo; /* pointer to chain data */
 	struct ebt_entry *e; /* pointer to entry data */
 	unsigned int n; /* n'th entry */
 };
 
-struct ebt_table_info
-{
+struct ebt_table_info {
 	/* total size of the entries */
 	unsigned int entries_size;
 	unsigned int nentries;
@@ -282,8 +270,7 @@ struct ebt_table_info
 	struct ebt_counter counters[0] ____cacheline_aligned;
 };
 
-struct ebt_table
-{
+struct ebt_table {
 	struct list_head list;
 	char name[EBT_TABLE_MAXNAMELEN];
 	struct ebt_replace_kernel *table;
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index 61fafc868a7b..27b3f5807305 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -76,8 +76,7 @@ struct ipt_ip {
 /* This structure defines each of the firewall rules.  Consists of 3
    parts which are 1) general IP header stuff 2) match specific
    stuff 3) the target to perform if the rule matches */
-struct ipt_entry
-{
+struct ipt_entry {
 	struct ipt_ip ip;
 
 	/* Mark with fields that we care about. */
@@ -135,8 +134,7 @@ struct ipt_entry
 #define IPT_UDP_INV_MASK	XT_UDP_INV_MASK
 
 /* ICMP matching stuff */
-struct ipt_icmp
-{
+struct ipt_icmp {
 	u_int8_t type;				/* type to match */
 	u_int8_t code[2];			/* range of code */
 	u_int8_t invflags;			/* Inverse flags */
@@ -146,8 +144,7 @@ struct ipt_icmp
 #define IPT_ICMP_INV	0x01	/* Invert the sense of type/code test */
 
 /* The argument to IPT_SO_GET_INFO */
-struct ipt_getinfo
-{
+struct ipt_getinfo {
 	/* Which table: caller fills this in. */
 	char name[IPT_TABLE_MAXNAMELEN];
 
@@ -169,8 +166,7 @@ struct ipt_getinfo
 };
 
 /* The argument to IPT_SO_SET_REPLACE. */
-struct ipt_replace
-{
+struct ipt_replace {
 	/* Which table. */
 	char name[IPT_TABLE_MAXNAMELEN];
 
@@ -204,8 +200,7 @@ struct ipt_replace
 #define ipt_counters_info xt_counters_info
 
 /* The argument to IPT_SO_GET_ENTRIES. */
-struct ipt_get_entries
-{
+struct ipt_get_entries {
 	/* Which table: user fills this in. */
 	char name[IPT_TABLE_MAXNAMELEN];
 
@@ -250,20 +245,17 @@ extern struct xt_table *ipt_register_table(struct net *net,
 extern void ipt_unregister_table(struct xt_table *table);
 
 /* Standard entry. */
-struct ipt_standard
-{
+struct ipt_standard {
 	struct ipt_entry entry;
 	struct ipt_standard_target target;
 };
 
-struct ipt_error_target
-{
+struct ipt_error_target {
 	struct ipt_entry_target target;
 	char errorname[IPT_FUNCTION_MAXNAMELEN];
 };
 
-struct ipt_error
-{
+struct ipt_error {
 	struct ipt_entry entry;
 	struct ipt_error_target target;
 };
@@ -301,8 +293,7 @@ extern unsigned int ipt_do_table(struct sk_buff *skb,
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
-struct compat_ipt_entry
-{
+struct compat_ipt_entry {
 	struct ipt_ip ip;
 	compat_uint_t nfcache;
 	u_int16_t target_offset;
diff --git a/include/linux/netfilter_ipv4/ipt_SAME.h b/include/linux/netfilter_ipv4/ipt_SAME.h
index be6e682a85ec..2529660c5b38 100644
--- a/include/linux/netfilter_ipv4/ipt_SAME.h
+++ b/include/linux/netfilter_ipv4/ipt_SAME.h
@@ -5,8 +5,7 @@
 
 #define IPT_SAME_NODST		0x01
 
-struct ipt_same_info
-{
+struct ipt_same_info {
 	unsigned char info;
 	u_int32_t rangesize;
 	u_int32_t ipnum;
diff --git a/include/linux/netfilter_ipv4/ipt_ah.h b/include/linux/netfilter_ipv4/ipt_ah.h
index 7b9a2ac7adb9..2e555b4d05e3 100644
--- a/include/linux/netfilter_ipv4/ipt_ah.h
+++ b/include/linux/netfilter_ipv4/ipt_ah.h
@@ -1,8 +1,7 @@
 #ifndef _IPT_AH_H
 #define _IPT_AH_H
 
-struct ipt_ah
-{
+struct ipt_ah {
 	u_int32_t spis[2];			/* Security Parameter Index */
 	u_int8_t  invflags;			/* Inverse flags */
 };
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index a64e1451ac38..b31050d20ae4 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -88,8 +88,7 @@ struct ip6t_ip6 {
 /* This structure defines each of the firewall rules.  Consists of 3
    parts which are 1) general IP header stuff 2) match specific
    stuff 3) the target to perform if the rule matches */
-struct ip6t_entry
-{
+struct ip6t_entry {
 	struct ip6t_ip6 ipv6;
 
 	/* Mark with fields that we care about. */
@@ -111,20 +110,17 @@ struct ip6t_entry
 };
 
 /* Standard entry */
-struct ip6t_standard
-{
+struct ip6t_standard {
 	struct ip6t_entry entry;
 	struct ip6t_standard_target target;
 };
 
-struct ip6t_error_target
-{
+struct ip6t_error_target {
 	struct ip6t_entry_target target;
 	char errorname[IP6T_FUNCTION_MAXNAMELEN];
 };
 
-struct ip6t_error
-{
+struct ip6t_error {
 	struct ip6t_entry entry;
 	struct ip6t_error_target target;
 };
@@ -195,8 +191,7 @@ struct ip6t_error
 #define IP6T_UDP_INV_MASK	XT_UDP_INV_MASK
 
 /* ICMP matching stuff */
-struct ip6t_icmp
-{
+struct ip6t_icmp {
 	u_int8_t type;				/* type to match */
 	u_int8_t code[2];			/* range of code */
 	u_int8_t invflags;			/* Inverse flags */
@@ -206,8 +201,7 @@ struct ip6t_icmp
 #define IP6T_ICMP_INV	0x01	/* Invert the sense of type/code test */
 
 /* The argument to IP6T_SO_GET_INFO */
-struct ip6t_getinfo
-{
+struct ip6t_getinfo {
 	/* Which table: caller fills this in. */
 	char name[IP6T_TABLE_MAXNAMELEN];
 
@@ -229,8 +223,7 @@ struct ip6t_getinfo
 };
 
 /* The argument to IP6T_SO_SET_REPLACE. */
-struct ip6t_replace
-{
+struct ip6t_replace {
 	/* Which table. */
 	char name[IP6T_TABLE_MAXNAMELEN];
 
@@ -264,8 +257,7 @@ struct ip6t_replace
 #define ip6t_counters_info xt_counters_info
 
 /* The argument to IP6T_SO_GET_ENTRIES. */
-struct ip6t_get_entries
-{
+struct ip6t_get_entries {
 	/* Which table: user fills this in. */
 	char name[IP6T_TABLE_MAXNAMELEN];
 
@@ -330,8 +322,7 @@ extern int ip6_masked_addrcmp(const struct in6_addr *addr1,
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
-struct compat_ip6t_entry
-{
+struct compat_ip6t_entry {
 	struct ip6t_ip6 ipv6;
 	compat_uint_t nfcache;
 	u_int16_t target_offset;
diff --git a/include/linux/netfilter_ipv6/ip6t_ah.h b/include/linux/netfilter_ipv6/ip6t_ah.h
index 8531879eb464..17a745cfb2c7 100644
--- a/include/linux/netfilter_ipv6/ip6t_ah.h
+++ b/include/linux/netfilter_ipv6/ip6t_ah.h
@@ -1,8 +1,7 @@
 #ifndef _IP6T_AH_H
 #define _IP6T_AH_H
 
-struct ip6t_ah
-{
+struct ip6t_ah {
 	u_int32_t spis[2];			/* Security Parameter Index */
 	u_int32_t hdrlen;			/* Header Length */
 	u_int8_t  hdrres;			/* Test of the Reserved Filed */
diff --git a/include/linux/netfilter_ipv6/ip6t_frag.h b/include/linux/netfilter_ipv6/ip6t_frag.h
index 66070a0d6dfc..3724d0850920 100644
--- a/include/linux/netfilter_ipv6/ip6t_frag.h
+++ b/include/linux/netfilter_ipv6/ip6t_frag.h
@@ -1,8 +1,7 @@
 #ifndef _IP6T_FRAG_H
 #define _IP6T_FRAG_H
 
-struct ip6t_frag
-{
+struct ip6t_frag {
 	u_int32_t ids[2];			/* Security Parameter Index */
 	u_int32_t hdrlen;			/* Header Length */
 	u_int8_t  flags;			/*  */
diff --git a/include/linux/netfilter_ipv6/ip6t_ipv6header.h b/include/linux/netfilter_ipv6/ip6t_ipv6header.h
index 51c53fc9c44a..01dfd445596a 100644
--- a/include/linux/netfilter_ipv6/ip6t_ipv6header.h
+++ b/include/linux/netfilter_ipv6/ip6t_ipv6header.h
@@ -8,8 +8,7 @@ on whether they contain certain headers */
 #ifndef __IPV6HEADER_H
 #define __IPV6HEADER_H
 
-struct ip6t_ipv6header_info
-{
+struct ip6t_ipv6header_info {
 	u_int8_t matchflags;
 	u_int8_t invflags;
 	u_int8_t modeflag;
diff --git a/include/linux/netfilter_ipv6/ip6t_mh.h b/include/linux/netfilter_ipv6/ip6t_mh.h
index b9ca9a5f74d0..18549bca2d1f 100644
--- a/include/linux/netfilter_ipv6/ip6t_mh.h
+++ b/include/linux/netfilter_ipv6/ip6t_mh.h
@@ -2,8 +2,7 @@
 #define _IP6T_MH_H
 
 /* MH matching stuff */
-struct ip6t_mh
-{
+struct ip6t_mh {
 	u_int8_t types[2];	/* MH type range */
 	u_int8_t invflags;	/* Inverse flags */
 };
diff --git a/include/linux/netfilter_ipv6/ip6t_opts.h b/include/linux/netfilter_ipv6/ip6t_opts.h
index a07e36380ae8..62d89bcd9f9c 100644
--- a/include/linux/netfilter_ipv6/ip6t_opts.h
+++ b/include/linux/netfilter_ipv6/ip6t_opts.h
@@ -3,8 +3,7 @@
 
 #define IP6T_OPTS_OPTSNR 16
 
-struct ip6t_opts
-{
+struct ip6t_opts {
 	u_int32_t hdrlen;			/* Header Length */
 	u_int8_t flags;				/*  */
 	u_int8_t invflags;			/* Inverse flags */
diff --git a/include/linux/netfilter_ipv6/ip6t_rt.h b/include/linux/netfilter_ipv6/ip6t_rt.h
index 52156023e8db..ab91bfd2cd00 100644
--- a/include/linux/netfilter_ipv6/ip6t_rt.h
+++ b/include/linux/netfilter_ipv6/ip6t_rt.h
@@ -5,8 +5,7 @@
 
 #define IP6T_RT_HOPS 16
 
-struct ip6t_rt
-{
+struct ip6t_rt {
 	u_int32_t rt_type;			/* Routing Type */
 	u_int32_t segsleft[2];			/* Segments Left */
 	u_int32_t hdrlen;			/* Header Length */
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index ab5d3126831f..fde27c017326 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -29,16 +29,14 @@
 
 struct net;
 
-struct sockaddr_nl
-{
+struct sockaddr_nl {
 	sa_family_t	nl_family;	/* AF_NETLINK	*/
 	unsigned short	nl_pad;		/* zero		*/
 	__u32		nl_pid;		/* port ID	*/
        	__u32		nl_groups;	/* multicast groups mask */
 };
 
-struct nlmsghdr
-{
+struct nlmsghdr {
 	__u32		nlmsg_len;	/* Length of message including header */
 	__u16		nlmsg_type;	/* Message content */
 	__u16		nlmsg_flags;	/* Additional flags */
@@ -94,8 +92,7 @@ struct nlmsghdr
 
 #define NLMSG_MIN_TYPE		0x10	/* < 0x10: reserved control messages */
 
-struct nlmsgerr
-{
+struct nlmsgerr {
 	int		error;
 	struct nlmsghdr msg;
 };
@@ -106,8 +103,7 @@ struct nlmsgerr
 #define NETLINK_BROADCAST_ERROR	4
 #define NETLINK_NO_ENOBUFS	5
 
-struct nl_pktinfo
-{
+struct nl_pktinfo {
 	__u32	group;
 };
 
@@ -127,8 +123,7 @@ enum {
  *  <-------------- nlattr->nla_len -------------->
  */
 
-struct nlattr
-{
+struct nlattr {
 	__u16           nla_len;
 	__u16           nla_type;
 };
@@ -161,8 +156,7 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
 	return (struct nlmsghdr *)skb->data;
 }
 
-struct netlink_skb_parms
-{
+struct netlink_skb_parms {
 	struct ucred		creds;		/* Skb credentials	*/
 	__u32			pid;
 	__u32			dst_group;
@@ -220,8 +214,7 @@ int netlink_sendskb(struct sock *sk, struct sk_buff *skb);
 #define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN)
 
 
-struct netlink_callback
-{
+struct netlink_callback {
 	struct sk_buff		*skb;
 	const struct nlmsghdr	*nlh;
 	int			(*dump)(struct sk_buff * skb,
@@ -231,8 +224,7 @@ struct netlink_callback
 	long			args[6];
 };
 
-struct netlink_notify
-{
+struct netlink_notify {
 	struct net *net;
 	int pid;
 	int protocol;
diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index 3c842edff388..7f6ba8658abe 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -75,8 +75,7 @@ bits 9,10,11: redirect counter -  redirect TTL. Loop avoidance
 #define SET_TC_AT(v,n)   ((V_TC_AT(n)) | (v & ~M_TC_AT))
 
 /* Action attributes */
-enum
-{
+enum {
 	TCA_ACT_UNSPEC,
 	TCA_ACT_KIND,
 	TCA_ACT_OPTIONS,
@@ -108,8 +107,7 @@ enum
 #define TC_ACT_JUMP		0x10000000
 
 /* Action type identifiers*/
-enum
-{
+enum {
 	TCA_ID_UNSPEC=0,
 	TCA_ID_POLICE=1,
 	/* other actions go here */
@@ -118,8 +116,7 @@ enum
 
 #define TCA_ID_MAX __TCA_ID_MAX
 
-struct tc_police
-{
+struct tc_police {
 	__u32			index;
 	int			action;
 #define TC_POLICE_UNSPEC	TC_ACT_UNSPEC
@@ -138,15 +135,13 @@ struct tc_police
 	__u32			capab;
 };
 
-struct tcf_t
-{
+struct tcf_t {
 	__u64   install;
 	__u64   lastuse;
 	__u64   expires;
 };
 
-struct tc_cnt
-{
+struct tc_cnt {
 	int                   refcnt; 
 	int                   bindcnt;
 };
@@ -158,8 +153,7 @@ struct tc_cnt
 	int                   refcnt; \
 	int                   bindcnt
 
-enum
-{
+enum {
 	TCA_POLICE_UNSPEC,
 	TCA_POLICE_TBF,
 	TCA_POLICE_RATE,
@@ -182,8 +176,7 @@ enum
 #define TC_U32_UNSPEC	0
 #define TC_U32_ROOT	(0xFFF00000)
 
-enum
-{
+enum {
 	TCA_U32_UNSPEC,
 	TCA_U32_CLASSID,
 	TCA_U32_HASH,
@@ -200,16 +193,14 @@ enum
 
 #define TCA_U32_MAX (__TCA_U32_MAX - 1)
 
-struct tc_u32_key
-{
+struct tc_u32_key {
 	__be32		mask;
 	__be32		val;
 	int		off;
 	int		offmask;
 };
 
-struct tc_u32_sel
-{
+struct tc_u32_sel {
 	unsigned char		flags;
 	unsigned char		offshift;
 	unsigned char		nkeys;
@@ -223,15 +214,13 @@ struct tc_u32_sel
 	struct tc_u32_key	keys[0];
 };
 
-struct tc_u32_mark
-{
+struct tc_u32_mark {
 	__u32		val;
 	__u32		mask;
 	__u32		success;
 };
 
-struct tc_u32_pcnt
-{
+struct tc_u32_pcnt {
 	__u64 rcnt;
 	__u64 rhit;
 	__u64 kcnts[0];
@@ -249,8 +238,7 @@ struct tc_u32_pcnt
 
 /* RSVP filter */
 
-enum
-{
+enum {
 	TCA_RSVP_UNSPEC,
 	TCA_RSVP_CLASSID,
 	TCA_RSVP_DST,
@@ -263,15 +251,13 @@ enum
 
 #define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 )
 
-struct tc_rsvp_gpi
-{
+struct tc_rsvp_gpi {
 	__u32	key;
 	__u32	mask;
 	int	offset;
 };
 
-struct tc_rsvp_pinfo
-{
+struct tc_rsvp_pinfo {
 	struct tc_rsvp_gpi dpi;
 	struct tc_rsvp_gpi spi;
 	__u8	protocol;
@@ -282,8 +268,7 @@ struct tc_rsvp_pinfo
 
 /* ROUTE filter */
 
-enum
-{
+enum {
 	TCA_ROUTE4_UNSPEC,
 	TCA_ROUTE4_CLASSID,
 	TCA_ROUTE4_TO,
@@ -299,8 +284,7 @@ enum
 
 /* FW filter */
 
-enum
-{
+enum {
 	TCA_FW_UNSPEC,
 	TCA_FW_CLASSID,
 	TCA_FW_POLICE,
@@ -314,8 +298,7 @@ enum
 
 /* TC index filter */
 
-enum
-{
+enum {
 	TCA_TCINDEX_UNSPEC,
 	TCA_TCINDEX_HASH,
 	TCA_TCINDEX_MASK,
@@ -331,8 +314,7 @@ enum
 
 /* Flow filter */
 
-enum
-{
+enum {
 	FLOW_KEY_SRC,
 	FLOW_KEY_DST,
 	FLOW_KEY_PROTO,
@@ -355,14 +337,12 @@ enum
 
 #define FLOW_KEY_MAX	(__FLOW_KEY_MAX - 1)
 
-enum
-{
+enum {
 	FLOW_MODE_MAP,
 	FLOW_MODE_HASH,
 };
 
-enum
-{
+enum {
 	TCA_FLOW_UNSPEC,
 	TCA_FLOW_KEYS,
 	TCA_FLOW_MODE,
@@ -383,8 +363,7 @@ enum
 
 /* Basic filter */
 
-enum
-{
+enum {
 	TCA_BASIC_UNSPEC,
 	TCA_BASIC_CLASSID,
 	TCA_BASIC_EMATCHES,
@@ -398,8 +377,7 @@ enum
 
 /* Cgroup classifier */
 
-enum
-{
+enum {
 	TCA_CGROUP_UNSPEC,
 	TCA_CGROUP_ACT,
 	TCA_CGROUP_POLICE,
@@ -411,14 +389,12 @@ enum
 
 /* Extended Matches */
 
-struct tcf_ematch_tree_hdr
-{
+struct tcf_ematch_tree_hdr {
 	__u16		nmatches;
 	__u16		progid;
 };
 
-enum
-{
+enum {
 	TCA_EMATCH_TREE_UNSPEC,
 	TCA_EMATCH_TREE_HDR,
 	TCA_EMATCH_TREE_LIST,
@@ -426,8 +402,7 @@ enum
 };
 #define TCA_EMATCH_TREE_MAX (__TCA_EMATCH_TREE_MAX - 1)
 
-struct tcf_ematch_hdr
-{
+struct tcf_ematch_hdr {
 	__u16		matchid;
 	__u16		kind;
 	__u16		flags;
@@ -457,8 +432,7 @@ struct tcf_ematch_hdr
 #define TCF_EM_REL_MASK	3
 #define TCF_EM_REL_VALID(v) (((v) & TCF_EM_REL_MASK) != TCF_EM_REL_MASK)
 
-enum
-{
+enum {
 	TCF_LAYER_LINK,
 	TCF_LAYER_NETWORK,
 	TCF_LAYER_TRANSPORT,
@@ -479,13 +453,11 @@ enum
 #define        TCF_EM_VLAN		6
 #define	TCF_EM_MAX		6
 
-enum
-{
+enum {
 	TCF_EM_PROG_TC
 };
 
-enum
-{
+enum {
 	TCF_EM_OPND_EQ,
 	TCF_EM_OPND_GT,
 	TCF_EM_OPND_LT
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index d51a2b3e221e..2cfa4bc8dea6 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -29,8 +29,7 @@
    Particular schedulers may have also their private records.
  */
 
-struct tc_stats
-{
+struct tc_stats {
 	__u64	bytes;			/* NUmber of enqueues bytes */
 	__u32	packets;		/* Number of enqueued packets	*/
 	__u32	drops;			/* Packets dropped because of lack of resources */
@@ -42,8 +41,7 @@ struct tc_stats
 	__u32	backlog;
 };
 
-struct tc_estimator
-{
+struct tc_estimator {
 	signed char	interval;
 	unsigned char	ewma_log;
 };
@@ -75,8 +73,7 @@ struct tc_estimator
 #define TC_H_ROOT	(0xFFFFFFFFU)
 #define TC_H_INGRESS    (0xFFFFFFF1U)
 
-struct tc_ratespec
-{
+struct tc_ratespec {
 	unsigned char	cell_log;
 	unsigned char	__reserved;
 	unsigned short	overhead;
@@ -109,8 +106,7 @@ enum {
 
 /* FIFO section */
 
-struct tc_fifo_qopt
-{
+struct tc_fifo_qopt {
 	__u32	limit;	/* Queue length: bytes for bfifo, packets for pfifo */
 };
 
@@ -119,8 +115,7 @@ struct tc_fifo_qopt
 #define TCQ_PRIO_BANDS	16
 #define TCQ_MIN_PRIO_BANDS 2
 
-struct tc_prio_qopt
-{
+struct tc_prio_qopt {
 	int	bands;			/* Number of bands */
 	__u8	priomap[TC_PRIO_MAX+1];	/* Map: logical priority -> PRIO band */
 };
@@ -134,8 +129,7 @@ struct tc_multiq_qopt {
 
 /* TBF section */
 
-struct tc_tbf_qopt
-{
+struct tc_tbf_qopt {
 	struct tc_ratespec rate;
 	struct tc_ratespec peakrate;
 	__u32		limit;
@@ -143,8 +137,7 @@ struct tc_tbf_qopt
 	__u32		mtu;
 };
 
-enum
-{
+enum {
 	TCA_TBF_UNSPEC,
 	TCA_TBF_PARMS,
 	TCA_TBF_RTAB,
@@ -161,8 +154,7 @@ enum
 
 /* SFQ section */
 
-struct tc_sfq_qopt
-{
+struct tc_sfq_qopt {
 	unsigned	quantum;	/* Bytes per round allocated to flow */
 	int		perturb_period;	/* Period of hash perturbation */
 	__u32		limit;		/* Maximal packets in queue */
@@ -170,8 +162,7 @@ struct tc_sfq_qopt
 	unsigned	flows;		/* Maximal number of flows  */
 };
 
-struct tc_sfq_xstats
-{
+struct tc_sfq_xstats {
 	__s32		allot;
 };
 
@@ -186,8 +177,7 @@ struct tc_sfq_xstats
 
 /* RED section */
 
-enum
-{
+enum {
 	TCA_RED_UNSPEC,
 	TCA_RED_PARMS,
 	TCA_RED_STAB,
@@ -196,8 +186,7 @@ enum
 
 #define TCA_RED_MAX (__TCA_RED_MAX - 1)
 
-struct tc_red_qopt
-{
+struct tc_red_qopt {
 	__u32		limit;		/* HARD maximal queue length (bytes)	*/
 	__u32		qth_min;	/* Min average length threshold (bytes) */
 	__u32		qth_max;	/* Max average length threshold (bytes) */
@@ -209,8 +198,7 @@ struct tc_red_qopt
 #define TC_RED_HARDDROP	2
 };
 
-struct tc_red_xstats
-{
+struct tc_red_xstats {
 	__u32           early;          /* Early drops */
 	__u32           pdrop;          /* Drops due to queue limits */
 	__u32           other;          /* Drops due to drop() calls */
@@ -221,8 +209,7 @@ struct tc_red_xstats
 
 #define MAX_DPs 16
 
-enum
-{
+enum {
        TCA_GRED_UNSPEC,
        TCA_GRED_PARMS,
        TCA_GRED_STAB,
@@ -232,8 +219,7 @@ enum
 
 #define TCA_GRED_MAX (__TCA_GRED_MAX - 1)
 
-struct tc_gred_qopt
-{
+struct tc_gred_qopt {
 	__u32		limit;        /* HARD maximal queue length (bytes)    */
 	__u32		qth_min;      /* Min average length threshold (bytes) */
 	__u32		qth_max;      /* Max average length threshold (bytes) */
@@ -253,8 +239,7 @@ struct tc_gred_qopt
 };
 
 /* gred setup */
-struct tc_gred_sopt
-{
+struct tc_gred_sopt {
 	__u32		DPs;
 	__u32		def_DP;
 	__u8		grio;
@@ -267,8 +252,7 @@ struct tc_gred_sopt
 #define TC_HTB_MAXDEPTH		8
 #define TC_HTB_PROTOVER		3 /* the same as HTB and TC's major */
 
-struct tc_htb_opt
-{
+struct tc_htb_opt {
 	struct tc_ratespec 	rate;
 	struct tc_ratespec 	ceil;
 	__u32	buffer;
@@ -277,8 +261,7 @@ struct tc_htb_opt
 	__u32	level;		/* out only */
 	__u32	prio;
 };
-struct tc_htb_glob
-{
+struct tc_htb_glob {
 	__u32 version;		/* to match HTB/TC */
     	__u32 rate2quantum;	/* bps->quantum divisor */
     	__u32 defcls;		/* default class number */
@@ -287,8 +270,7 @@ struct tc_htb_glob
 	/* stats */
 	__u32 direct_pkts; /* count of non shapped packets */
 };
-enum
-{
+enum {
 	TCA_HTB_UNSPEC,
 	TCA_HTB_PARMS,
 	TCA_HTB_INIT,
@@ -299,8 +281,7 @@ enum
 
 #define TCA_HTB_MAX (__TCA_HTB_MAX - 1)
 
-struct tc_htb_xstats
-{
+struct tc_htb_xstats {
 	__u32 lends;
 	__u32 borrows;
 	__u32 giants;	/* too big packets (rate will not be accurate) */
@@ -310,28 +291,24 @@ struct tc_htb_xstats
 
 /* HFSC section */
 
-struct tc_hfsc_qopt
-{
+struct tc_hfsc_qopt {
 	__u16	defcls;		/* default class */
 };
 
-struct tc_service_curve
-{
+struct tc_service_curve {
 	__u32	m1;		/* slope of the first segment in bps */
 	__u32	d;		/* x-projection of the first segment in us */
 	__u32	m2;		/* slope of the second segment in bps */
 };
 
-struct tc_hfsc_stats
-{
+struct tc_hfsc_stats {
 	__u64	work;		/* total work done */
 	__u64	rtwork;		/* work done by real-time criteria */
 	__u32	period;		/* current period */
 	__u32	level;		/* class level in hierarchy */
 };
 
-enum
-{
+enum {
 	TCA_HFSC_UNSPEC,
 	TCA_HFSC_RSC,
 	TCA_HFSC_FSC,
@@ -348,8 +325,7 @@ enum
 #define TC_CBQ_MAXLEVEL		8
 #define TC_CBQ_DEF_EWMA		5
 
-struct tc_cbq_lssopt
-{
+struct tc_cbq_lssopt {
 	unsigned char	change;
 	unsigned char	flags;
 #define TCF_CBQ_LSS_BOUNDED	1
@@ -368,8 +344,7 @@ struct tc_cbq_lssopt
 	__u32		avpkt;
 };
 
-struct tc_cbq_wrropt
-{
+struct tc_cbq_wrropt {
 	unsigned char	flags;
 	unsigned char	priority;
 	unsigned char	cpriority;
@@ -378,8 +353,7 @@ struct tc_cbq_wrropt
 	__u32		weight;
 };
 
-struct tc_cbq_ovl
-{
+struct tc_cbq_ovl {
 	unsigned char	strategy;
 #define	TC_CBQ_OVL_CLASSIC	0
 #define	TC_CBQ_OVL_DELAY	1
@@ -391,30 +365,26 @@ struct tc_cbq_ovl
 	__u32		penalty;
 };
 
-struct tc_cbq_police
-{
+struct tc_cbq_police {
 	unsigned char	police;
 	unsigned char	__res1;
 	unsigned short	__res2;
 };
 
-struct tc_cbq_fopt
-{
+struct tc_cbq_fopt {
 	__u32		split;
 	__u32		defmap;
 	__u32		defchange;
 };
 
-struct tc_cbq_xstats
-{
+struct tc_cbq_xstats {
 	__u32		borrows;
 	__u32		overactions;
 	__s32		avgidle;
 	__s32		undertime;
 };
 
-enum
-{
+enum {
 	TCA_CBQ_UNSPEC,
 	TCA_CBQ_LSSOPT,
 	TCA_CBQ_WRROPT,
@@ -459,8 +429,7 @@ enum {
 
 /* Network emulator */
 
-enum
-{
+enum {
 	TCA_NETEM_UNSPEC,
 	TCA_NETEM_CORR,
 	TCA_NETEM_DELAY_DIST,
@@ -471,8 +440,7 @@ enum
 
 #define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1)
 
-struct tc_netem_qopt
-{
+struct tc_netem_qopt {
 	__u32	latency;	/* added delay (us) */
 	__u32   limit;		/* fifo limit (packets) */
 	__u32	loss;		/* random packet loss (0=none ~0=100%) */
@@ -481,21 +449,18 @@ struct tc_netem_qopt
 	__u32	jitter;		/* random jitter in latency (us) */
 };
 
-struct tc_netem_corr
-{
+struct tc_netem_corr {
 	__u32	delay_corr;	/* delay correlation */
 	__u32	loss_corr;	/* packet loss correlation */
 	__u32	dup_corr;	/* duplicate correlation  */
 };
 
-struct tc_netem_reorder
-{
+struct tc_netem_reorder {
 	__u32	probability;
 	__u32	correlation;
 };
 
-struct tc_netem_corrupt
-{
+struct tc_netem_corrupt {
 	__u32	probability;
 	__u32	correlation;
 };
@@ -504,8 +469,7 @@ struct tc_netem_corrupt
 
 /* DRR */
 
-enum
-{
+enum {
 	TCA_DRR_UNSPEC,
 	TCA_DRR_QUANTUM,
 	__TCA_DRR_MAX
@@ -513,8 +477,7 @@ enum
 
 #define TCA_DRR_MAX	(__TCA_DRR_MAX - 1)
 
-struct tc_drr_stats
-{
+struct tc_drr_stats {
 	__u32	deficit;
 };
 
diff --git a/include/linux/route.h b/include/linux/route.h
index f7ed35d5e653..6600708311c8 100644
--- a/include/linux/route.h
+++ b/include/linux/route.h
@@ -27,8 +27,7 @@
 #include <linux/compiler.h>
 
 /* This structure gets passed by the SIOCADDRT and SIOCDELRT calls. */
-struct rtentry 
-{
+struct rtentry {
 	unsigned long	rt_pad1;
 	struct sockaddr	rt_dst;		/* target address		*/
 	struct sockaddr	rt_gateway;	/* gateway addr (RTF_GATEWAY)	*/
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index e78b60cd65a4..14fc906ed602 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -127,8 +127,7 @@ enum {
    with attribute type.
  */
 
-struct rtattr
-{
+struct rtattr {
 	unsigned short	rta_len;
 	unsigned short	rta_type;
 };
@@ -154,8 +153,7 @@ struct rtattr
  *		Definitions used in routing table administration.
  ****/
 
-struct rtmsg
-{
+struct rtmsg {
 	unsigned char		rtm_family;
 	unsigned char		rtm_dst_len;
 	unsigned char		rtm_src_len;
@@ -171,8 +169,7 @@ struct rtmsg
 
 /* rtm_type */
 
-enum
-{
+enum {
 	RTN_UNSPEC,
 	RTN_UNICAST,		/* Gateway or direct route	*/
 	RTN_LOCAL,		/* Accept locally		*/
@@ -230,8 +227,7 @@ enum
    could be assigned a value between UNIVERSE and LINK.
 */
 
-enum rt_scope_t
-{
+enum rt_scope_t {
 	RT_SCOPE_UNIVERSE=0,
 /* User defined values  */
 	RT_SCOPE_SITE=200,
@@ -249,8 +245,7 @@ enum rt_scope_t
 
 /* Reserved table identifiers */
 
-enum rt_class_t
-{
+enum rt_class_t {
 	RT_TABLE_UNSPEC=0,
 /* User defined values */
 	RT_TABLE_COMPAT=252,
@@ -263,8 +258,7 @@ enum rt_class_t
 
 /* Routing message attributes */
 
-enum rtattr_type_t
-{
+enum rtattr_type_t {
 	RTA_UNSPEC,
 	RTA_DST,
 	RTA_SRC,
@@ -298,8 +292,7 @@ enum rtattr_type_t
  * and rtt for different paths from multipath.
  */
 
-struct rtnexthop
-{
+struct rtnexthop {
 	unsigned short		rtnh_len;
 	unsigned char		rtnh_flags;
 	unsigned char		rtnh_hops;
@@ -325,8 +318,7 @@ struct rtnexthop
 
 /* RTM_CACHEINFO */
 
-struct rta_cacheinfo
-{
+struct rta_cacheinfo {
 	__u32	rta_clntref;
 	__u32	rta_lastuse;
 	__s32	rta_expires;
@@ -341,8 +333,7 @@ struct rta_cacheinfo
 
 /* RTM_METRICS --- array of struct rtattr with types of RTAX_* */
 
-enum
-{
+enum {
 	RTAX_UNSPEC,
 #define RTAX_UNSPEC RTAX_UNSPEC
 	RTAX_LOCK,
@@ -383,8 +374,7 @@ enum
 #define RTAX_FEATURE_NO_WSCALE	0x00000010
 #define RTAX_FEATURE_NO_DSACK	0x00000020
 
-struct rta_session
-{
+struct rta_session {
 	__u8	proto;
 	__u8	pad1;
 	__u16	pad2;
@@ -409,8 +399,7 @@ struct rta_session
  *		General form of address family dependent message.
  ****/
 
-struct rtgenmsg
-{
+struct rtgenmsg {
 	unsigned char		rtgen_family;
 };
 
@@ -423,8 +412,7 @@ struct rtgenmsg
  * on network protocol.
  */
 
-struct ifinfomsg
-{
+struct ifinfomsg {
 	unsigned char	ifi_family;
 	unsigned char	__ifi_pad;
 	unsigned short	ifi_type;		/* ARPHRD_* */
@@ -437,8 +425,7 @@ struct ifinfomsg
  *		prefix information 
  ****/
 
-struct prefixmsg
-{
+struct prefixmsg {
 	unsigned char	prefix_family;
 	unsigned char	prefix_pad1;
 	unsigned short	prefix_pad2;
@@ -459,8 +446,7 @@ enum
 
 #define PREFIX_MAX	(__PREFIX_MAX - 1)
 
-struct prefix_cacheinfo
-{
+struct prefix_cacheinfo {
 	__u32	preferred_time;
 	__u32	valid_time;
 };
@@ -470,8 +456,7 @@ struct prefix_cacheinfo
  *		Traffic control messages.
  ****/
 
-struct tcmsg
-{
+struct tcmsg {
 	unsigned char	tcm_family;
 	unsigned char	tcm__pad1;
 	unsigned short	tcm__pad2;
@@ -481,8 +466,7 @@ struct tcmsg
 	__u32		tcm_info;
 };
 
-enum
-{
+enum {
 	TCA_UNSPEC,
 	TCA_KIND,
 	TCA_OPTIONS,
@@ -504,8 +488,7 @@ enum
  *		Neighbor Discovery userland options
  ****/
 
-struct nduseroptmsg
-{
+struct nduseroptmsg {
 	unsigned char	nduseropt_family;
 	unsigned char	nduseropt_pad1;
 	unsigned short	nduseropt_opts_len;	/* Total length of options */
@@ -517,8 +500,7 @@ struct nduseroptmsg
 	/* Followed by one or more ND options */
 };
 
-enum
-{
+enum {
 	NDUSEROPT_UNSPEC,
 	NDUSEROPT_SRCADDR,
 	__NDUSEROPT_MAX
@@ -600,8 +582,7 @@ enum rtnetlink_groups {
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
 
 /* TC action piece */
-struct tcamsg
-{
+struct tcamsg {
 	unsigned char	tca_family;
 	unsigned char	tca__pad1;
 	unsigned short	tca__pad2;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0c68fbd6faac..d0448c5d1ffc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -491,8 +491,7 @@ extern int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 			int len,int odd, struct sk_buff *skb),
 			void *from, int length);
 
-struct skb_seq_state
-{
+struct skb_seq_state {
 	__u32		lower_offset;
 	__u32		upper_offset;
 	__u32		frag_idx;
diff --git a/include/linux/tc_act/tc_defact.h b/include/linux/tc_act/tc_defact.h
index 964f473af0f0..6f65d07c7ce2 100644
--- a/include/linux/tc_act/tc_defact.h
+++ b/include/linux/tc_act/tc_defact.h
@@ -3,13 +3,11 @@
 
 #include <linux/pkt_cls.h>
 
-struct tc_defact
-{
+struct tc_defact {
 	tc_gen;
 };
                                                                                 
-enum
-{
+enum {
 	TCA_DEF_UNSPEC,
 	TCA_DEF_TM,
 	TCA_DEF_PARMS,
diff --git a/include/linux/tc_act/tc_gact.h b/include/linux/tc_act/tc_gact.h
index e895c0a39629..f7bf94eed510 100644
--- a/include/linux/tc_act/tc_gact.h
+++ b/include/linux/tc_act/tc_gact.h
@@ -5,14 +5,12 @@
 #include <linux/pkt_cls.h>
 
 #define TCA_ACT_GACT 5
-struct tc_gact
-{
+struct tc_gact {
 	tc_gen;
 
 };
 
-struct tc_gact_p
-{
+struct tc_gact_p {
 #define PGACT_NONE              0
 #define PGACT_NETRAND           1
 #define PGACT_DETERM            2
@@ -22,8 +20,7 @@ struct tc_gact_p
 	int                   paction;
 };
  
-enum
-{
+enum {
 	TCA_GACT_UNSPEC,
 	TCA_GACT_TM,
 	TCA_GACT_PARMS,
diff --git a/include/linux/tc_act/tc_ipt.h b/include/linux/tc_act/tc_ipt.h
index 4b6f7b6c7a79..a2335563d21f 100644
--- a/include/linux/tc_act/tc_ipt.h
+++ b/include/linux/tc_act/tc_ipt.h
@@ -5,8 +5,7 @@
 
 #define TCA_ACT_IPT 6
 
-enum
-{
+enum {
 	TCA_IPT_UNSPEC,
 	TCA_IPT_TABLE,
 	TCA_IPT_HOOK,
diff --git a/include/linux/tc_act/tc_mirred.h b/include/linux/tc_act/tc_mirred.h
index 0a99ab60d610..7561750e8fd6 100644
--- a/include/linux/tc_act/tc_mirred.h
+++ b/include/linux/tc_act/tc_mirred.h
@@ -10,15 +10,13 @@
 #define TCA_INGRESS_REDIR 3  /* packet redirect to INGRESS*/
 #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */
                                                                                 
-struct tc_mirred
-{
+struct tc_mirred {
 	tc_gen;
 	int                     eaction;   /* one of IN/EGRESS_MIRROR/REDIR */
 	__u32                   ifindex;  /* ifindex of egress port */
 };
                                                                                 
-enum
-{
+enum {
 	TCA_MIRRED_UNSPEC,
 	TCA_MIRRED_TM,
 	TCA_MIRRED_PARMS,
diff --git a/include/linux/tc_act/tc_nat.h b/include/linux/tc_act/tc_nat.h
index e7cf31e8ba79..6663aeba0b9a 100644
--- a/include/linux/tc_act/tc_nat.h
+++ b/include/linux/tc_act/tc_nat.h
@@ -6,8 +6,7 @@
 
 #define TCA_ACT_NAT 9
 
-enum
-{
+enum {
 	TCA_NAT_UNSPEC,
 	TCA_NAT_PARMS,
 	TCA_NAT_TM,
@@ -17,8 +16,7 @@ enum
 
 #define TCA_NAT_FLAG_EGRESS 1
 
-struct tc_nat
-{
+struct tc_nat {
 	tc_gen;
 	__be32 old_addr;
 	__be32 new_addr;
diff --git a/include/linux/tc_act/tc_pedit.h b/include/linux/tc_act/tc_pedit.h
index 54ce9064115a..716cfabcd5b2 100644
--- a/include/linux/tc_act/tc_pedit.h
+++ b/include/linux/tc_act/tc_pedit.h
@@ -6,8 +6,7 @@
 
 #define TCA_ACT_PEDIT 7
 
-enum
-{
+enum {
 	TCA_PEDIT_UNSPEC,
 	TCA_PEDIT_TM,
 	TCA_PEDIT_PARMS,
@@ -15,8 +14,7 @@ enum
 };
 #define TCA_PEDIT_MAX (__TCA_PEDIT_MAX - 1)
                                                                                 
-struct tc_pedit_key
-{
+struct tc_pedit_key {
 	__u32           mask;  /* AND */
 	__u32           val;   /*XOR */
 	__u32           off;  /*offset */
@@ -25,8 +23,7 @@ struct tc_pedit_key
 	__u32           shift;
 };
                                                                                 
-struct tc_pedit_sel
-{
+struct tc_pedit_sel {
 	tc_gen;
 	unsigned char           nkeys;
 	unsigned char           flags;
diff --git a/include/linux/tc_ematch/tc_em_cmp.h b/include/linux/tc_ematch/tc_em_cmp.h
index 38e7f7b25ec2..f34bb1bae083 100644
--- a/include/linux/tc_ematch/tc_em_cmp.h
+++ b/include/linux/tc_ematch/tc_em_cmp.h
@@ -4,8 +4,7 @@
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
 
-struct tcf_em_cmp
-{
+struct tcf_em_cmp {
 	__u32		val;
 	__u32		mask;
 	__u16		off;
@@ -15,8 +14,7 @@ struct tcf_em_cmp
 	__u8		opnd:4;
 };
 
-enum
-{
+enum {
 	TCF_EM_ALIGN_U8  = 1,
 	TCF_EM_ALIGN_U16 = 2,
 	TCF_EM_ALIGN_U32 = 4
diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h
index dcfb733fa1f6..0864206ec1a3 100644
--- a/include/linux/tc_ematch/tc_em_meta.h
+++ b/include/linux/tc_ematch/tc_em_meta.h
@@ -4,8 +4,7 @@
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
 
-enum
-{
+enum {
 	TCA_EM_META_UNSPEC,
 	TCA_EM_META_HDR,
 	TCA_EM_META_LVALUE,
@@ -14,8 +13,7 @@ enum
 };
 #define TCA_EM_META_MAX (__TCA_EM_META_MAX - 1)
 
-struct tcf_meta_val
-{
+struct tcf_meta_val {
 	__u16			kind;
 	__u8			shift;
 	__u8			op;
@@ -26,16 +24,14 @@ struct tcf_meta_val
 #define TCF_META_ID_MASK	0x7ff
 #define TCF_META_ID(kind)	((kind) & TCF_META_ID_MASK)
 
-enum
-{
+enum {
 	TCF_META_TYPE_VAR,
 	TCF_META_TYPE_INT,
 	__TCF_META_TYPE_MAX
 };
 #define TCF_META_TYPE_MAX (__TCF_META_TYPE_MAX - 1)
 
-enum
-{
+enum {
 	TCF_META_ID_VALUE,
 	TCF_META_ID_RANDOM,
 	TCF_META_ID_LOADAVG_0,
@@ -87,8 +83,7 @@ enum
 };
 #define TCF_META_ID_MAX (__TCF_META_ID_MAX - 1)
 
-struct tcf_meta_hdr
-{
+struct tcf_meta_hdr {
 	struct tcf_meta_val	left;
 	struct tcf_meta_val	right;
 };
diff --git a/include/linux/tc_ematch/tc_em_nbyte.h b/include/linux/tc_ematch/tc_em_nbyte.h
index 9ed8c2e58488..7172cfb999c1 100644
--- a/include/linux/tc_ematch/tc_em_nbyte.h
+++ b/include/linux/tc_ematch/tc_em_nbyte.h
@@ -4,8 +4,7 @@
 #include <linux/types.h>
 #include <linux/pkt_cls.h>
 
-struct tcf_em_nbyte
-{
+struct tcf_em_nbyte {
 	__u16		off;
 	__u16		len:12;
 	__u8		layer:4;
diff --git a/include/linux/tc_ematch/tc_em_text.h b/include/linux/tc_ematch/tc_em_text.h
index d12a73a225fc..5aac4045ba88 100644
--- a/include/linux/tc_ematch/tc_em_text.h
+++ b/include/linux/tc_ematch/tc_em_text.h
@@ -6,8 +6,7 @@
 
 #define TC_EM_TEXT_ALGOSIZ	16
 
-struct tcf_em_text
-{
+struct tcf_em_text {
 	char		algo[TC_EM_TEXT_ALGOSIZ];
 	__u16		from_offset;
 	__u16		to_offset;
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 61723a7c21fe..eeecb8547a2a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -102,8 +102,7 @@ enum {
 #define TCPI_OPT_WSCALE		4
 #define TCPI_OPT_ECN		8
 
-enum tcp_ca_state
-{
+enum tcp_ca_state {
 	TCP_CA_Open = 0,
 #define TCPF_CA_Open	(1<<TCP_CA_Open)
 	TCP_CA_Disorder = 1,
@@ -116,8 +115,7 @@ enum tcp_ca_state
 #define TCPF_CA_Loss	(1<<TCP_CA_Loss)
 };
 
-struct tcp_info
-{
+struct tcp_info {
 	__u8	tcpi_state;
 	__u8	tcpi_ca_state;
 	__u8	tcpi_retransmits;
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 2d4ec15abaca..3246f0e66bcc 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -10,8 +10,7 @@
 /* Structure to encapsulate addresses. I do not want to use
  * "standard" structure. My apologies.
  */
-typedef union
-{
+typedef union {
 	__be32		a4;
 	__be32		a6[4];
 } xfrm_address_t;
@@ -20,8 +19,7 @@ typedef union
  * the state by (spi,daddr,ah/esp) or to store information about
  * spi, protocol and tunnel address on output.
  */
-struct xfrm_id
-{
+struct xfrm_id {
 	xfrm_address_t	daddr;
 	__be32		spi;
 	__u8		proto;
@@ -45,8 +43,7 @@ struct xfrm_sec_ctx {
 
 /* Selector, used as selector both on policy rules (SPD) and SAs. */
 
-struct xfrm_selector
-{
+struct xfrm_selector {
 	xfrm_address_t	daddr;
 	xfrm_address_t	saddr;
 	__be16	dport;
@@ -63,8 +60,7 @@ struct xfrm_selector
 
 #define XFRM_INF (~(__u64)0)
 
-struct xfrm_lifetime_cfg
-{
+struct xfrm_lifetime_cfg {
 	__u64	soft_byte_limit;
 	__u64	hard_byte_limit;
 	__u64	soft_packet_limit;
@@ -75,16 +71,14 @@ struct xfrm_lifetime_cfg
 	__u64	hard_use_expires_seconds;
 };
 
-struct xfrm_lifetime_cur
-{
+struct xfrm_lifetime_cur {
 	__u64	bytes;
 	__u64	packets;
 	__u64	add_time;
 	__u64	use_time;
 };
 
-struct xfrm_replay_state
-{
+struct xfrm_replay_state {
 	__u32	oseq;
 	__u32	seq;
 	__u32	bitmap;
@@ -109,16 +103,14 @@ struct xfrm_stats {
 	__u32	integrity_failed;
 };
 
-enum
-{
+enum {
 	XFRM_POLICY_TYPE_MAIN	= 0,
 	XFRM_POLICY_TYPE_SUB	= 1,
 	XFRM_POLICY_TYPE_MAX	= 2,
 	XFRM_POLICY_TYPE_ANY	= 255
 };
 
-enum
-{
+enum {
 	XFRM_POLICY_IN	= 0,
 	XFRM_POLICY_OUT	= 1,
 	XFRM_POLICY_FWD	= 2,
@@ -126,8 +118,7 @@ enum
 	XFRM_POLICY_MAX	= 3
 };
 
-enum
-{
+enum {
 	XFRM_SHARE_ANY,		/* No limitations */
 	XFRM_SHARE_SESSION,	/* For this session only */
 	XFRM_SHARE_USER,	/* For this user only */
-- 
cgit v1.2.3


From 2da3e160cb3d226d87b907fab26850d838ed8d7c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Nov 2009 23:06:50 +0100
Subject: hw-breakpoint: Move asm-generic/hw_breakpoint.h to
 linux/hw_breakpoint.h

We plan to make the breakpoints parameters generic among architectures.
For that it's better to move the asm-generic header to a generic linux
header.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 arch/x86/include/asm/hw_breakpoint.h |   2 +-
 include/asm-generic/hw_breakpoint.h  | 139 -----------------------------------
 include/linux/hw_breakpoint.h        | 136 ++++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+), 140 deletions(-)
 delete mode 100644 include/asm-generic/hw_breakpoint.h
 create mode 100644 include/linux/hw_breakpoint.h

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 1acb4d45de70..3cfca8e2b5f6 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -12,7 +12,7 @@ struct arch_hw_breakpoint {
 };
 
 #include <linux/kdebug.h>
-#include <asm-generic/hw_breakpoint.h>
+#include <linux/hw_breakpoint.h>
 
 /* Available HW breakpoint length encodings */
 #define HW_BREAKPOINT_LEN_1		0x40
diff --git a/include/asm-generic/hw_breakpoint.h b/include/asm-generic/hw_breakpoint.h
deleted file mode 100644
index 9bf2d12eb74a..000000000000
--- a/include/asm-generic/hw_breakpoint.h
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef	_ASM_GENERIC_HW_BREAKPOINT_H
-#define	_ASM_GENERIC_HW_BREAKPOINT_H
-
-#ifndef __ARCH_HW_BREAKPOINT_H
-#error "Please don't include this file directly"
-#endif
-
-#ifdef	__KERNEL__
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/kallsyms.h>
-
-/**
- * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
- * @triggered: callback invoked after target address access
- * @info: arch-specific breakpoint info (address, length, and type)
- *
- * %hw_breakpoint structures are the kernel's way of representing
- * hardware breakpoints.  These are data breakpoints
- * (also known as "watchpoints", triggered on data access), and the breakpoint's
- * target address can be located in either kernel space or user space.
- *
- * The breakpoint's address, length, and type are highly
- * architecture-specific.  The values are encoded in the @info field; you
- * specify them when registering the breakpoint.  To examine the encoded
- * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
- * below.
- *
- * The address is specified as a regular kernel pointer (for kernel-space
- * breakponts) or as an %__user pointer (for user-space breakpoints).
- * With register_user_hw_breakpoint(), the address must refer to a
- * location in user space.  The breakpoint will be active only while the
- * requested task is running.  Conversely with
- * register_kernel_hw_breakpoint(), the address must refer to a location
- * in kernel space, and the breakpoint will be active on all CPUs
- * regardless of the current task.
- *
- * The length is the breakpoint's extent in bytes, which is subject to
- * certain limitations.  include/asm/hw_breakpoint.h contains macros
- * defining the available lengths for a specific architecture.  Note that
- * the address's alignment must match the length.  The breakpoint will
- * catch accesses to any byte in the range from address to address +
- * (length - 1).
- *
- * The breakpoint's type indicates the sort of access that will cause it
- * to trigger.  Possible values may include:
- *
- * 	%HW_BREAKPOINT_RW (triggered on read or write access),
- * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
- * 	%HW_BREAKPOINT_READ (triggered on read access).
- *
- * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
- * possibilities are available on all architectures.  Execute breakpoints
- * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
- *
- * When a breakpoint gets hit, the @triggered callback is
- * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
- * processor registers.
- * Data breakpoints occur after the memory access has taken place.
- * Breakpoints are disabled during execution @triggered, to avoid
- * recursive traps and allow unhindered access to breakpointed memory.
- *
- * This sample code sets a breakpoint on pid_max and registers a callback
- * function for writes to that variable.  Note that it is not portable
- * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
- *
- * ----------------------------------------------------------------------
- *
- * #include <asm/hw_breakpoint.h>
- *
- * struct hw_breakpoint my_bp;
- *
- * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
- * {
- * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
- * 	dump_stack();
- *  	.......<more debugging output>........
- * }
- *
- * static struct hw_breakpoint my_bp;
- *
- * static int init_module(void)
- * {
- *	..........<do anything>............
- *	my_bp.info.type = HW_BREAKPOINT_WRITE;
- *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
- *
- *	my_bp.installed = (void *)my_bp_installed;
- *
- *	rc = register_kernel_hw_breakpoint(&my_bp);
- *	..........<do anything>............
- * }
- *
- * static void cleanup_module(void)
- * {
- *	..........<do anything>............
- *	unregister_kernel_hw_breakpoint(&my_bp);
- *	..........<do anything>............
- * }
- *
- * ----------------------------------------------------------------------
- */
-struct hw_breakpoint {
-	void (*triggered)(struct hw_breakpoint *, struct pt_regs *);
-	struct arch_hw_breakpoint info;
-};
-
-/*
- * len and type values are defined in include/asm/hw_breakpoint.h.
- * Available values vary according to the architecture.  On i386 the
- * possibilities are:
- *
- *	HW_BREAKPOINT_LEN_1
- *	HW_BREAKPOINT_LEN_2
- *	HW_BREAKPOINT_LEN_4
- *	HW_BREAKPOINT_RW
- *	HW_BREAKPOINT_READ
- *
- * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
- * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
- * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
- */
-
-extern int register_user_hw_breakpoint(struct task_struct *tsk,
-					struct hw_breakpoint *bp);
-extern int modify_user_hw_breakpoint(struct task_struct *tsk,
-					struct hw_breakpoint *bp);
-extern void unregister_user_hw_breakpoint(struct task_struct *tsk,
-						struct hw_breakpoint *bp);
-/*
- * Kernel breakpoints are not associated with any particular thread.
- */
-extern int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
-extern void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
-
-extern unsigned int hbp_kernel_pos;
-
-#endif	/* __KERNEL__ */
-#endif	/* _ASM_GENERIC_HW_BREAKPOINT_H */
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
new file mode 100644
index 000000000000..61ccc8f17eac
--- /dev/null
+++ b/include/linux/hw_breakpoint.h
@@ -0,0 +1,136 @@
+#ifndef _LINUX_HW_BREAKPOINT_H
+#define _LINUX_HW_BREAKPOINT_H
+
+
+#ifdef	__KERNEL__
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+
+/**
+ * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
+ * @triggered: callback invoked after target address access
+ * @info: arch-specific breakpoint info (address, length, and type)
+ *
+ * %hw_breakpoint structures are the kernel's way of representing
+ * hardware breakpoints.  These are data breakpoints
+ * (also known as "watchpoints", triggered on data access), and the breakpoint's
+ * target address can be located in either kernel space or user space.
+ *
+ * The breakpoint's address, length, and type are highly
+ * architecture-specific.  The values are encoded in the @info field; you
+ * specify them when registering the breakpoint.  To examine the encoded
+ * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
+ * below.
+ *
+ * The address is specified as a regular kernel pointer (for kernel-space
+ * breakponts) or as an %__user pointer (for user-space breakpoints).
+ * With register_user_hw_breakpoint(), the address must refer to a
+ * location in user space.  The breakpoint will be active only while the
+ * requested task is running.  Conversely with
+ * register_kernel_hw_breakpoint(), the address must refer to a location
+ * in kernel space, and the breakpoint will be active on all CPUs
+ * regardless of the current task.
+ *
+ * The length is the breakpoint's extent in bytes, which is subject to
+ * certain limitations.  include/asm/hw_breakpoint.h contains macros
+ * defining the available lengths for a specific architecture.  Note that
+ * the address's alignment must match the length.  The breakpoint will
+ * catch accesses to any byte in the range from address to address +
+ * (length - 1).
+ *
+ * The breakpoint's type indicates the sort of access that will cause it
+ * to trigger.  Possible values may include:
+ *
+ * 	%HW_BREAKPOINT_RW (triggered on read or write access),
+ * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
+ * 	%HW_BREAKPOINT_READ (triggered on read access).
+ *
+ * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
+ * possibilities are available on all architectures.  Execute breakpoints
+ * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
+ *
+ * When a breakpoint gets hit, the @triggered callback is
+ * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
+ * processor registers.
+ * Data breakpoints occur after the memory access has taken place.
+ * Breakpoints are disabled during execution @triggered, to avoid
+ * recursive traps and allow unhindered access to breakpointed memory.
+ *
+ * This sample code sets a breakpoint on pid_max and registers a callback
+ * function for writes to that variable.  Note that it is not portable
+ * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
+ *
+ * ----------------------------------------------------------------------
+ *
+ * #include <asm/hw_breakpoint.h>
+ *
+ * struct hw_breakpoint my_bp;
+ *
+ * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
+ * 	dump_stack();
+ *  	.......<more debugging output>........
+ * }
+ *
+ * static struct hw_breakpoint my_bp;
+ *
+ * static int init_module(void)
+ * {
+ *	..........<do anything>............
+ *	my_bp.info.type = HW_BREAKPOINT_WRITE;
+ *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
+ *
+ *	my_bp.installed = (void *)my_bp_installed;
+ *
+ *	rc = register_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ *	..........<do anything>............
+ *	unregister_kernel_hw_breakpoint(&my_bp);
+ *	..........<do anything>............
+ * }
+ *
+ * ----------------------------------------------------------------------
+ */
+struct hw_breakpoint {
+	void (*triggered)(struct hw_breakpoint *, struct pt_regs *);
+	struct arch_hw_breakpoint info;
+};
+
+/*
+ * len and type values are defined in include/asm/hw_breakpoint.h.
+ * Available values vary according to the architecture.  On i386 the
+ * possibilities are:
+ *
+ *	HW_BREAKPOINT_LEN_1
+ *	HW_BREAKPOINT_LEN_2
+ *	HW_BREAKPOINT_LEN_4
+ *	HW_BREAKPOINT_RW
+ *	HW_BREAKPOINT_READ
+ *
+ * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
+ * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
+ * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
+ */
+
+extern int register_user_hw_breakpoint(struct task_struct *tsk,
+					struct hw_breakpoint *bp);
+extern int modify_user_hw_breakpoint(struct task_struct *tsk,
+					struct hw_breakpoint *bp);
+extern void unregister_user_hw_breakpoint(struct task_struct *tsk,
+						struct hw_breakpoint *bp);
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+extern int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+extern void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+
+extern unsigned int hbp_kernel_pos;
+
+#endif	/* __KERNEL__ */
+#endif	/* _LINUX_HW_BREAKPOINT_H */
-- 
cgit v1.2.3


From 13f18aa05f5abe135f47b6417537ae2b2fedc18c Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 5 Nov 2009 20:44:37 -0800
Subject: net: drop capability from protocol definitions

struct can_proto had a capability field which wasn't ever used.  It is
dropped entirely.

struct inet_protosw had a capability field which can be more clearly
expressed in the code by just checking if sock->type = SOCK_RAW.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/can/core.h | 2 --
 include/net/protocol.h   | 4 ----
 net/can/af_can.c         | 5 -----
 net/can/bcm.c            | 1 -
 net/can/raw.c            | 1 -
 net/dccp/ipv4.c          | 1 -
 net/dccp/ipv6.c          | 1 -
 net/ipv4/af_inet.c       | 5 +----
 net/ipv4/udplite.c       | 1 -
 net/ipv6/af_inet6.c      | 2 +-
 net/ipv6/raw.c           | 1 -
 net/ipv6/tcp_ipv6.c      | 1 -
 net/ipv6/udp.c           | 1 -
 net/ipv6/udplite.c       | 1 -
 net/sctp/ipv6.c          | 2 --
 net/sctp/protocol.c      | 2 --
 16 files changed, 2 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/core.h b/include/linux/can/core.h
index 25085cbadcfc..6c507bea275f 100644
--- a/include/linux/can/core.h
+++ b/include/linux/can/core.h
@@ -32,14 +32,12 @@
  * struct can_proto - CAN protocol structure
  * @type:       type argument in socket() syscall, e.g. SOCK_DGRAM.
  * @protocol:   protocol number in socket() syscall.
- * @capability: capability needed to open the socket, or -1 for no restriction.
  * @ops:        pointer to struct proto_ops for sock->ops.
  * @prot:       pointer to struct proto structure.
  */
 struct can_proto {
 	int              type;
 	int              protocol;
-	int              capability;
 	struct proto_ops *ops;
 	struct proto     *prot;
 };
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 89932d45da59..f1effdd3c265 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -82,10 +82,6 @@ struct inet_protosw {
 	struct proto	 *prot;
 	const struct proto_ops *ops;
   
-	int              capability; /* Which (if any) capability do
-				      * we need to use this socket
-				      * interface?
-                                      */
 	char             no_check;   /* checksum on rcv/xmit/none? */
 	unsigned char	 flags;      /* See INET_PROTOSW_* below.  */
 };
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 3f2eb27e1ffb..9c0426dc3184 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -160,11 +160,6 @@ static int can_create(struct net *net, struct socket *sock, int protocol)
 		goto errout;
 	}
 
-	if (cp->capability >= 0 && !capable(cp->capability)) {
-		err = -EPERM;
-		goto errout;
-	}
-
 	sock->ops = cp->ops;
 
 	sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot);
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 2f47039c79dd..67b5433db13b 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1576,7 +1576,6 @@ static struct proto bcm_proto __read_mostly = {
 static struct can_proto bcm_can_proto __read_mostly = {
 	.type       = SOCK_DGRAM,
 	.protocol   = CAN_BCM,
-	.capability = -1,
 	.ops        = &bcm_ops,
 	.prot       = &bcm_proto,
 };
diff --git a/net/can/raw.c b/net/can/raw.c
index 6e77db58b9e6..abca920440b5 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -742,7 +742,6 @@ static struct proto raw_proto __read_mostly = {
 static struct can_proto raw_can_proto __read_mostly = {
 	.type       = SOCK_RAW,
 	.protocol   = CAN_RAW,
-	.capability = -1,
 	.ops        = &raw_ops,
 	.prot       = &raw_proto,
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 00028d4b09d9..2423a0866733 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -991,7 +991,6 @@ static struct inet_protosw dccp_v4_protosw = {
 	.protocol	= IPPROTO_DCCP,
 	.prot		= &dccp_v4_prot,
 	.ops		= &inet_dccp_ops,
-	.capability	= -1,
 	.no_check	= 0,
 	.flags		= INET_PROTOSW_ICSK,
 };
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6d89f9f7d5d8..50ea91a77705 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1185,7 +1185,6 @@ static struct inet_protosw dccp_v6_protosw = {
 	.protocol	= IPPROTO_DCCP,
 	.prot		= &dccp_v6_prot,
 	.ops		= &inet6_dccp_ops,
-	.capability	= -1,
 	.flags		= INET_PROTOSW_ICSK,
 };
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 538e84d0bcba..180ec4c94919 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -325,7 +325,7 @@ lookup_protocol:
 	}
 
 	err = -EPERM;
-	if (answer->capability > 0 && !capable(answer->capability))
+	if (sock->type == SOCK_RAW && !capable(CAP_NET_RAW))
 		goto out_rcu_unlock;
 
 	err = -EAFNOSUPPORT;
@@ -947,7 +947,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol =   IPPROTO_TCP,
 		.prot =       &tcp_prot,
 		.ops =        &inet_stream_ops,
-		.capability = -1,
 		.no_check =   0,
 		.flags =      INET_PROTOSW_PERMANENT |
 			      INET_PROTOSW_ICSK,
@@ -958,7 +957,6 @@ static struct inet_protosw inetsw_array[] =
 		.protocol =   IPPROTO_UDP,
 		.prot =       &udp_prot,
 		.ops =        &inet_dgram_ops,
-		.capability = -1,
 		.no_check =   UDP_CSUM_DEFAULT,
 		.flags =      INET_PROTOSW_PERMANENT,
        },
@@ -969,7 +967,6 @@ static struct inet_protosw inetsw_array[] =
 	       .protocol =   IPPROTO_IP,	/* wild card */
 	       .prot =       &raw_prot,
 	       .ops =        &inet_sockraw_ops,
-	       .capability = CAP_NET_RAW,
 	       .no_check =   UDP_CSUM_DEFAULT,
 	       .flags =      INET_PROTOSW_REUSE,
        }
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 470c504b9554..66f79513f4a5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -64,7 +64,6 @@ static struct inet_protosw udplite4_protosw = {
 	.protocol	=  IPPROTO_UDPLITE,
 	.prot		=  &udplite_prot,
 	.ops		=  &inet_dgram_ops,
-	.capability	= -1,
 	.no_check	=  0,		/* must checksum (RFC 3828) */
 	.flags		=  INET_PROTOSW_PERMANENT,
 };
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9105b25defe5..1b3889356599 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -158,7 +158,7 @@ lookup_protocol:
 	}
 
 	err = -EPERM;
-	if (answer->capability > 0 && !capable(answer->capability))
+	if (sock->type == SOCK_RAW && !capable(CAP_NET_RAW))
 		goto out_rcu_unlock;
 
 	sock->ops = answer->ops;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index cb834ab7f071..818ef21ba76d 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1336,7 +1336,6 @@ static struct inet_protosw rawv6_protosw = {
 	.protocol	= IPPROTO_IP,	/* wild card */
 	.prot		= &rawv6_prot,
 	.ops		= &inet6_sockraw_ops,
-	.capability	= CAP_NET_RAW,
 	.no_check	= UDP_CSUM_DEFAULT,
 	.flags		= INET_PROTOSW_REUSE,
 };
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 34925f089e07..696a22f034e8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2112,7 +2112,6 @@ static struct inet_protosw tcpv6_protosw = {
 	.protocol	=	IPPROTO_TCP,
 	.prot		=	&tcpv6_prot,
 	.ops		=	&inet6_stream_ops,
-	.capability	=	-1,
 	.no_check	=	0,
 	.flags		=	INET_PROTOSW_PERMANENT |
 				INET_PROTOSW_ICSK,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d3b59d73f507..bbe2f3e445fc 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1286,7 +1286,6 @@ static struct inet_protosw udpv6_protosw = {
 	.protocol =  IPPROTO_UDP,
 	.prot =      &udpv6_prot,
 	.ops =       &inet6_dgram_ops,
-	.capability =-1,
 	.no_check =  UDP_CSUM_DEFAULT,
 	.flags =     INET_PROTOSW_PERMANENT,
 };
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index d737a27ee010..6ea6938919e6 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -62,7 +62,6 @@ static struct inet_protosw udplite6_protosw = {
 	.protocol	= IPPROTO_UDPLITE,
 	.prot		= &udplitev6_prot,
 	.ops		= &inet6_dgram_ops,
-	.capability	= -1,
 	.no_check	= 0,
 	.flags		= INET_PROTOSW_PERMANENT,
 };
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index bb280e60e00a..bacd6a7318be 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -930,7 +930,6 @@ static struct inet_protosw sctpv6_seqpacket_protosw = {
 	.protocol      = IPPROTO_SCTP,
 	.prot 	       = &sctpv6_prot,
 	.ops           = &inet6_seqpacket_ops,
-	.capability    = -1,
 	.no_check      = 0,
 	.flags         = SCTP_PROTOSW_FLAG
 };
@@ -939,7 +938,6 @@ static struct inet_protosw sctpv6_stream_protosw = {
 	.protocol      = IPPROTO_SCTP,
 	.prot 	       = &sctpv6_prot,
 	.ops           = &inet6_seqpacket_ops,
-	.capability    = -1,
 	.no_check      = 0,
 	.flags         = SCTP_PROTOSW_FLAG,
 };
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index fe44c57101de..08ef203d36ac 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -909,7 +909,6 @@ static struct inet_protosw sctp_seqpacket_protosw = {
 	.protocol   = IPPROTO_SCTP,
 	.prot       = &sctp_prot,
 	.ops        = &inet_seqpacket_ops,
-	.capability = -1,
 	.no_check   = 0,
 	.flags      = SCTP_PROTOSW_FLAG
 };
@@ -918,7 +917,6 @@ static struct inet_protosw sctp_stream_protosw = {
 	.protocol   = IPPROTO_SCTP,
 	.prot       = &sctp_prot,
 	.ops        = &inet_seqpacket_ops,
-	.capability = -1,
 	.no_check   = 0,
 	.flags      = SCTP_PROTOSW_FLAG
 };
-- 
cgit v1.2.3


From 3f378b684453f2a028eda463ce383370545d9cc9 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 5 Nov 2009 22:18:14 -0800
Subject: net: pass kern to net_proto_family create function

The generic __sock_create function has a kern argument which allows the
security system to make decisions based on if a socket is being created by
the kernel or by userspace.  This patch passes that flag to the
net_proto_family specific create function, so it can do the same thing.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/isdn/mISDN/socket.c    | 2 +-
 drivers/net/pppox.c            | 3 ++-
 include/linux/net.h            | 3 ++-
 net/appletalk/ddp.c            | 3 ++-
 net/atm/pvc.c                  | 3 ++-
 net/atm/svc.c                  | 7 ++++---
 net/ax25/af_ax25.c             | 3 ++-
 net/bluetooth/af_bluetooth.c   | 5 +++--
 net/bluetooth/bnep/sock.c      | 3 ++-
 net/bluetooth/cmtp/sock.c      | 3 ++-
 net/bluetooth/hci_sock.c       | 3 ++-
 net/bluetooth/hidp/sock.c      | 3 ++-
 net/bluetooth/l2cap.c          | 3 ++-
 net/bluetooth/rfcomm/sock.c    | 3 ++-
 net/bluetooth/sco.c            | 3 ++-
 net/can/af_can.c               | 3 ++-
 net/decnet/af_decnet.c         | 3 ++-
 net/econet/af_econet.c         | 3 ++-
 net/ieee802154/af_ieee802154.c | 2 +-
 net/ipv4/af_inet.c             | 3 ++-
 net/ipv6/af_inet6.c            | 3 ++-
 net/ipx/af_ipx.c               | 3 ++-
 net/irda/af_irda.c             | 7 ++++---
 net/iucv/af_iucv.c             | 3 ++-
 net/key/af_key.c               | 3 ++-
 net/llc/af_llc.c               | 5 ++++-
 net/netlink/af_netlink.c       | 3 ++-
 net/netrom/af_netrom.c         | 3 ++-
 net/packet/af_packet.c         | 3 ++-
 net/phonet/af_phonet.c         | 3 ++-
 net/rds/af_rds.c               | 3 ++-
 net/rose/af_rose.c             | 3 ++-
 net/rxrpc/af_rxrpc.c           | 3 ++-
 net/socket.c                   | 2 +-
 net/tipc/socket.c              | 6 ++++--
 net/unix/af_unix.c             | 3 ++-
 net/x25/af_x25.c               | 3 ++-
 37 files changed, 80 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 28182ed8dea1..fcfe17a19a61 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -779,7 +779,7 @@ base_sock_create(struct net *net, struct socket *sock, int protocol)
 }
 
 static int
-mISDN_sock_create(struct net *net, struct socket *sock, int proto)
+mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern)
 {
 	int err = -EPROTONOSUPPORT;
 
diff --git a/drivers/net/pppox.c b/drivers/net/pppox.c
index c14ee24c05a8..ac806b27c658 100644
--- a/drivers/net/pppox.c
+++ b/drivers/net/pppox.c
@@ -104,7 +104,8 @@ int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 
 EXPORT_SYMBOL(pppox_ioctl);
 
-static int pppox_create(struct net *net, struct socket *sock, int protocol)
+static int pppox_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
 {
 	int rc = -EPROTOTYPE;
 
diff --git a/include/linux/net.h b/include/linux/net.h
index 4da9d571b053..70ee3c310f15 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -204,7 +204,8 @@ struct proto_ops {
 
 struct net_proto_family {
 	int		family;
-	int		(*create)(struct net *net, struct socket *sock, int protocol);
+	int		(*create)(struct net *net, struct socket *sock,
+				  int protocol, int kern);
 	struct module	*owner;
 };
 
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index abe38014b7fd..4b0ce2e2b46e 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1021,7 +1021,8 @@ static struct proto ddp_proto = {
  * Create a socket. Initialise the socket, blank the addresses
  * set the state.
  */
-static int atalk_create(struct net *net, struct socket *sock, int protocol)
+static int atalk_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
 {
 	struct sock *sk;
 	int rc = -ESOCKTNOSUPPORT;
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index a6e1fdbae87f..8d74e62b0d79 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -127,7 +127,8 @@ static const struct proto_ops pvc_proto_ops = {
 };
 
 
-static int pvc_create(struct net *net, struct socket *sock,int protocol)
+static int pvc_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
 {
 	if (net != &init_net)
 		return -EAFNOSUPPORT;
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 819354233318..c7395070ee78 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -25,7 +25,7 @@
 #include "signaling.h"
 #include "addr.h"
 
-static int svc_create(struct net *net, struct socket *sock,int protocol);
+static int svc_create(struct net *net, struct socket *sock, int protocol, int kern);
 
 /*
  * Note: since all this is still nicely synchronized with the signaling demon,
@@ -330,7 +330,7 @@ static int svc_accept(struct socket *sock,struct socket *newsock,int flags)
 
 	lock_sock(sk);
 
-	error = svc_create(sock_net(sk), newsock,0);
+	error = svc_create(sock_net(sk), newsock, 0, 0);
 	if (error)
 		goto out;
 
@@ -650,7 +650,8 @@ static const struct proto_ops svc_proto_ops = {
 };
 
 
-static int svc_create(struct net *net, struct socket *sock,int protocol)
+static int svc_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
 {
 	int error;
 
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index f1e998b2796e..d6ddfa4c4471 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -799,7 +799,8 @@ static struct proto ax25_proto = {
 	.obj_size = sizeof(struct sock),
 };
 
-static int ax25_create(struct net *net, struct socket *sock, int protocol)
+static int ax25_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
 {
 	struct sock *sk;
 	ax25_cb *ax25;
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 399e59c9c6cb..087cc51f5927 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -126,7 +126,8 @@ int bt_sock_unregister(int proto)
 }
 EXPORT_SYMBOL(bt_sock_unregister);
 
-static int bt_sock_create(struct net *net, struct socket *sock, int proto)
+static int bt_sock_create(struct net *net, struct socket *sock, int proto,
+			  int kern)
 {
 	int err;
 
@@ -144,7 +145,7 @@ static int bt_sock_create(struct net *net, struct socket *sock, int proto)
 	read_lock(&bt_proto_lock);
 
 	if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) {
-		err = bt_proto[proto]->create(net, sock, proto);
+		err = bt_proto[proto]->create(net, sock, proto, kern);
 		bt_sock_reclassify_lock(sock, proto);
 		module_put(bt_proto[proto]->owner);
 	}
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 0a2c5460bb48..2ff6ac7b2ed4 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -195,7 +195,8 @@ static struct proto bnep_proto = {
 	.obj_size	= sizeof(struct bt_sock)
 };
 
-static int bnep_sock_create(struct net *net, struct socket *sock, int protocol)
+static int bnep_sock_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
 {
 	struct sock *sk;
 
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index de7c8040bc56..978cc3a718ad 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -190,7 +190,8 @@ static struct proto cmtp_proto = {
 	.obj_size	= sizeof(struct bt_sock)
 };
 
-static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol)
+static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
 {
 	struct sock *sk;
 
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index e7395f231989..1ca5c7ca9bd4 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -621,7 +621,8 @@ static struct proto hci_sk_proto = {
 	.obj_size	= sizeof(struct hci_pinfo)
 };
 
-static int hci_sock_create(struct net *net, struct socket *sock, int protocol)
+static int hci_sock_create(struct net *net, struct socket *sock, int protocol,
+			   int kern)
 {
 	struct sock *sk;
 
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index 4beb6a7a2953..9cfef68b9fec 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -241,7 +241,8 @@ static struct proto hidp_proto = {
 	.obj_size	= sizeof(struct bt_sock)
 };
 
-static int hidp_sock_create(struct net *net, struct socket *sock, int protocol)
+static int hidp_sock_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
 {
 	struct sock *sk;
 
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index d65101d92ee5..365ae161d702 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -819,7 +819,8 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int p
 	return sk;
 }
 
-static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol)
+static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol,
+			     int kern)
 {
 	struct sock *sk;
 
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index d3bfc1b0afb1..4b5968dda673 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -323,7 +323,8 @@ static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int
 	return sk;
 }
 
-static int rfcomm_sock_create(struct net *net, struct socket *sock, int protocol)
+static int rfcomm_sock_create(struct net *net, struct socket *sock,
+			      int protocol, int kern)
 {
 	struct sock *sk;
 
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 694a65541b73..dd8f6ec57dce 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -430,7 +430,8 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int pro
 	return sk;
 }
 
-static int sco_sock_create(struct net *net, struct socket *sock, int protocol)
+static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
+			   int kern)
 {
 	struct sock *sk;
 
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 9c0426dc3184..833bd838edc6 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -114,7 +114,8 @@ static void can_sock_destruct(struct sock *sk)
 	skb_queue_purge(&sk->sk_receive_queue);
 }
 
-static int can_create(struct net *net, struct socket *sock, int protocol)
+static int can_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
 {
 	struct sock *sk;
 	struct can_proto *cp;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 2e355841ca99..9ade3a6de954 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -675,7 +675,8 @@ char *dn_addr2asc(__u16 addr, char *buf)
 
 
-static int dn_create(struct net *net, struct socket *sock, int protocol)
+static int dn_create(struct net *net, struct socket *sock, int protocol,
+		     int kern)
 {
 	struct sock *sk;
 
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 5e9426a11c3e..596679803de5 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -605,7 +605,8 @@ static struct proto econet_proto = {
  *	Create an Econet socket
  */
 
-static int econet_create(struct net *net, struct socket *sock, int protocol)
+static int econet_create(struct net *net, struct socket *sock, int protocol,
+			 int kern)
 {
 	struct sock *sk;
 	struct econet_sock *eo;
diff --git a/net/ieee802154/af_ieee802154.c b/net/ieee802154/af_ieee802154.c
index 309348fba72b..de6e34d2a7f8 100644
--- a/net/ieee802154/af_ieee802154.c
+++ b/net/ieee802154/af_ieee802154.c
@@ -234,7 +234,7 @@ static const struct proto_ops ieee802154_dgram_ops = {
  * set the state.
  */
 static int ieee802154_create(struct net *net, struct socket *sock,
-		int protocol)
+			     int protocol, int kern)
 {
 	struct sock *sk;
 	int rc;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 180ec4c94919..5c7e42c02afb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -262,7 +262,8 @@ static inline int inet_netns_ok(struct net *net, int protocol)
  *	Create an inet socket.
  */
 
-static int inet_create(struct net *net, struct socket *sock, int protocol)
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
 {
 	struct sock *sk;
 	struct inet_protosw *answer;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 1b3889356599..45ed5e05ab32 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -95,7 +95,8 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
 	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
 }
 
-static int inet6_create(struct net *net, struct socket *sock, int protocol)
+static int inet6_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
 {
 	struct inet_sock *inet;
 	struct ipv6_pinfo *np;
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 6481ee4bdf72..96d193a24415 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1352,7 +1352,8 @@ static struct proto ipx_proto = {
 	.obj_size = sizeof(struct ipx_sock),
 };
 
-static int ipx_create(struct net *net, struct socket *sock, int protocol)
+static int ipx_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
 {
 	int rc = -ESOCKTNOSUPPORT;
 	struct sock *sk;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 9429e4002bca..e73a0016c0aa 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -61,7 +61,7 @@
 
 #include <net/irda/af_irda.h>
 
-static int irda_create(struct net *net, struct socket *sock, int protocol);
+static int irda_create(struct net *net, struct socket *sock, int protocol, int kern);
 
 static const struct proto_ops irda_stream_ops;
 static const struct proto_ops irda_seqpacket_ops;
@@ -839,7 +839,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	IRDA_DEBUG(2, "%s()\n", __func__);
 
-	err = irda_create(sock_net(sk), newsock, sk->sk_protocol);
+	err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
 	if (err)
 		return err;
 
@@ -1062,7 +1062,8 @@ static struct proto irda_proto = {
  *    Create IrDA socket
  *
  */
-static int irda_create(struct net *net, struct socket *sock, int protocol)
+static int irda_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
 {
 	struct sock *sk;
 	struct irda_sock *self;
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 3aebabb158a8..1e428863574f 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -481,7 +481,8 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio)
 }
 
 /* Create an IUCV socket */
-static int iucv_sock_create(struct net *net, struct socket *sock, int protocol)
+static int iucv_sock_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
 {
 	struct sock *sk;
 
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 472f6594184a..86b2c22d0918 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -177,7 +177,8 @@ static struct proto key_proto = {
 	.obj_size = sizeof(struct pfkey_sock),
 };
 
-static int pfkey_create(struct net *net, struct socket *sock, int protocol)
+static int pfkey_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
 {
 	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
 	struct sock *sk;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 4866b4fb0c27..5266c286b260 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -140,14 +140,17 @@ static struct proto llc_proto = {
 
 /**
  *	llc_ui_create - alloc and init a new llc_ui socket
+ *	@net: network namespace (must be default network)
  *	@sock: Socket to initialize and attach allocated sk to.
  *	@protocol: Unused.
+ *	@kern: on behalf of kernel or userspace
  *
  *	Allocate and initialize a new llc_ui socket, validate the user wants a
  *	socket type we have available.
  *	Returns 0 upon success, negative upon failure.
  */
-static int llc_ui_create(struct net *net, struct socket *sock, int protocol)
+static int llc_ui_create(struct net *net, struct socket *sock, int protocol,
+			 int kern)
 {
 	struct sock *sk;
 	int rc = -ESOCKTNOSUPPORT;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 0cd2d8829313..aea805c98da3 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -428,7 +428,8 @@ static int __netlink_create(struct net *net, struct socket *sock,
 	return 0;
 }
 
-static int netlink_create(struct net *net, struct socket *sock, int protocol)
+static int netlink_create(struct net *net, struct socket *sock, int protocol,
+			  int kern)
 {
 	struct module *module = NULL;
 	struct mutex *cb_mutex;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 281fa597cae5..4bdd5697f63b 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -425,7 +425,8 @@ static struct proto nr_proto = {
 	.obj_size = sizeof(struct nr_sock),
 };
 
-static int nr_create(struct net *net, struct socket *sock, int protocol)
+static int nr_create(struct net *net, struct socket *sock, int protocol,
+		     int kern)
 {
 	struct sock *sk;
 	struct nr_sock *nr;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 91d246d34780..3304caa65347 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1344,7 +1344,8 @@ static struct proto packet_proto = {
  *	Create a packet of type SOCK_PACKET.
  */
 
-static int packet_create(struct net *net, struct socket *sock, int protocol)
+static int packet_create(struct net *net, struct socket *sock, int protocol,
+			 int kern)
 {
 	struct sock *sk;
 	struct packet_sock *po;
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 66737aa995ea..3bd1be6b26f0 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -60,7 +60,8 @@ static inline void phonet_proto_put(struct phonet_protocol *pp)
 
 /* protocol family functions */
 
-static int pn_socket_create(struct net *net, struct socket *sock, int protocol)
+static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
 {
 	struct sock *sk;
 	struct pn_sock *pn;
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 2b978dc6e75d..e25d8d5ce8df 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -410,7 +410,8 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 	return 0;
 }
 
-static int rds_create(struct net *net, struct socket *sock, int protocol)
+static int rds_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
 {
 	struct sock *sk;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index c17734c2ce89..4de4287fec37 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -512,7 +512,8 @@ static struct proto rose_proto = {
 	.obj_size = sizeof(struct rose_sock),
 };
 
-static int rose_create(struct net *net, struct socket *sock, int protocol)
+static int rose_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
 {
 	struct sock *sk;
 	struct rose_sock *rose;
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 6817c9781ef3..f978d02a248a 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -608,7 +608,8 @@ static unsigned int rxrpc_poll(struct file *file, struct socket *sock,
 /*
  * create an RxRPC socket
  */
-static int rxrpc_create(struct net *net, struct socket *sock, int protocol)
+static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
 {
 	struct rxrpc_sock *rx;
 	struct sock *sk;
diff --git a/net/socket.c b/net/socket.c
index 9dff31c9b799..4f3e0f0c156b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1252,7 +1252,7 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
 	/* Now protected by module ref count */
 	rcu_read_unlock();
 
-	err = pf->create(net, sock, protocol);
+	err = pf->create(net, sock, protocol, kern);
 	if (err < 0)
 		goto out_module_put;
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index e6d9abf7440e..d00c2119faf3 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -177,6 +177,7 @@ static void reject_rx_queue(struct sock *sk)
  * @net: network namespace (must be default network)
  * @sock: pre-allocated socket structure
  * @protocol: protocol indicator (must be 0)
+ * @kern: caused by kernel or by userspace?
  *
  * This routine creates additional data structures used by the TIPC socket,
  * initializes them, and links them together.
@@ -184,7 +185,8 @@ static void reject_rx_queue(struct sock *sk)
  * Returns 0 on success, errno otherwise
  */
 
-static int tipc_create(struct net *net, struct socket *sock, int protocol)
+static int tipc_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
 {
 	const struct proto_ops *ops;
 	socket_state state;
@@ -1528,7 +1530,7 @@ static int accept(struct socket *sock, struct socket *new_sock, int flags)
 
 	buf = skb_peek(&sk->sk_receive_queue);
 
-	res = tipc_create(sock_net(sock->sk), new_sock, 0);
+	res = tipc_create(sock_net(sock->sk), new_sock, 0, 0);
 	if (!res) {
 		struct sock *new_sk = new_sock->sk;
 		struct tipc_sock *new_tsock = tipc_sk(new_sk);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3291902f0b88..178d3af2a605 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -621,7 +621,8 @@ out:
 	return sk;
 }
 
-static int unix_create(struct net *net, struct socket *sock, int protocol)
+static int unix_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
 {
 	if (protocol && protocol != PF_UNIX)
 		return -EPROTONOSUPPORT;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index e19d811788a5..38e235f61e27 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -501,7 +501,8 @@ out:
 	return sk;
 }
 
-static int x25_create(struct net *net, struct socket *sock, int protocol)
+static int x25_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
 {
 	struct sock *sk;
 	struct x25_sock *x25;
-- 
cgit v1.2.3


From 1eaa9d03d3ee9156c8c405b006ce892ae28290ad Mon Sep 17 00:00:00 2001
From: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Date: Tue, 15 Sep 2009 17:04:44 +0400
Subject: ieee802154: add LIST_PHY command support

Add nl802154 command to get information about PHY's present in
the system.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
---
 include/linux/nl802154.h   |   4 +
 net/ieee802154/Makefile    |   2 +-
 net/ieee802154/netlink.c   |   4 +
 net/ieee802154/nl-phy.c    | 188 +++++++++++++++++++++++++++++++++++++++++++++
 net/ieee802154/nl_policy.c |   2 +
 5 files changed, 199 insertions(+), 1 deletion(-)
 create mode 100644 net/ieee802154/nl-phy.c

(limited to 'include/linux')

diff --git a/include/linux/nl802154.h b/include/linux/nl802154.h
index b7d9435d5a9f..275fd94f4727 100644
--- a/include/linux/nl802154.h
+++ b/include/linux/nl802154.h
@@ -65,6 +65,9 @@ enum {
 	IEEE802154_ATTR_SEC,
 
 	IEEE802154_ATTR_PAGE,
+	IEEE802154_ATTR_CHANNEL_PAGE_LIST,
+
+	IEEE802154_ATTR_PHY_NAME,
 
 	__IEEE802154_ATTR_MAX,
 };
@@ -114,6 +117,7 @@ enum {
 	IEEE802154_RX_ENABLE_CONF, /* Not supported yet */
 
 	IEEE802154_LIST_IFACE,
+	IEEE802154_LIST_PHY,
 
 	__IEEE802154_CMD_MAX,
 };
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
index 69af9f6a404b..ce2d33582859 100644
--- a/net/ieee802154/Makefile
+++ b/net/ieee802154/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_IEEE802154) +=	ieee802154.o af_802154.o
-ieee802154-y		:= netlink.o nl-mac.o nl_policy.o wpan-class.o
+ieee802154-y		:= netlink.o nl-mac.o nl-phy.o nl_policy.o wpan-class.o
 af_802154-y		:= af_ieee802154.o raw.o dgram.o
 
 ccflags-y += -Wall -DDEBUG
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 5b738ec7d9f1..8a221737f75c 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -87,6 +87,10 @@ int __init ieee802154_nl_init(void)
 	if (rc)
 		goto fail;
 
+	rc = nl802154_phy_register();
+	if (rc)
+		goto fail;
+
 	return 0;
 
 fail:
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
new file mode 100644
index 000000000000..b7af722eb784
--- /dev/null
+++ b/net/ieee802154/nl-phy.c
@@ -0,0 +1,188 @@
+/*
+ * Netlink inteface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/wpan-phy.h>
+#include <linux/nl802154.h>
+
+#include "ieee802154.h"
+
+static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 pid,
+	u32 seq, int flags, struct wpan_phy *phy)
+{
+	void *hdr;
+	int i, pages = 0;
+	uint32_t *buf = kzalloc(32 * sizeof(uint32_t), GFP_KERNEL);
+
+	pr_debug("%s\n", __func__);
+
+	if (!buf)
+		goto out;
+
+	hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags,
+		IEEE802154_LIST_PHY);
+	if (!hdr)
+		goto out;
+
+	mutex_lock(&phy->pib_lock);
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy));
+
+	NLA_PUT_U8(msg, IEEE802154_ATTR_PAGE, phy->current_page);
+	NLA_PUT_U8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel);
+	for (i = 0; i < 32; i++) {
+		if (phy->channels_supported[i])
+			buf[pages++] = phy->channels_supported[i] | (i << 27);
+	}
+	if (pages)
+		NLA_PUT(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST,
+				pages * sizeof(uint32_t), buf);
+
+	mutex_unlock(&phy->pib_lock);
+	return genlmsg_end(msg, hdr);
+
+nla_put_failure:
+	mutex_unlock(&phy->pib_lock);
+	genlmsg_cancel(msg, hdr);
+out:
+	kfree(buf);
+	return -EMSGSIZE;
+}
+
+static int ieee802154_list_phy(struct sk_buff *skb,
+	struct genl_info *info)
+{
+	/* Request for interface name, index, type, IEEE address,
+	   PAN Id, short address */
+	struct sk_buff *msg;
+	struct wpan_phy *phy;
+	const char *name;
+	int rc = -ENOBUFS;
+
+	pr_debug("%s\n", __func__);
+
+	if (!info->attrs[IEEE802154_ATTR_PHY_NAME])
+		return -EINVAL;
+
+	name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+	if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0')
+		return -EINVAL; /* phy name should be null-terminated */
+
+
+	phy = wpan_phy_find(name);
+	if (!phy)
+		return -ENODEV;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		goto out_dev;
+
+	rc = ieee802154_nl_fill_phy(msg, info->snd_pid, info->snd_seq,
+			0, phy);
+	if (rc < 0)
+		goto out_free;
+
+	wpan_phy_put(phy);
+
+	return genlmsg_reply(msg, info);
+out_free:
+	nlmsg_free(msg);
+out_dev:
+	wpan_phy_put(phy);
+	return rc;
+
+}
+
+struct dump_phy_data {
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+	int idx, s_idx;
+};
+
+static int ieee802154_dump_phy_iter(struct wpan_phy *phy, void *_data)
+{
+	int rc;
+	struct dump_phy_data *data = _data;
+
+	pr_debug("%s\n", __func__);
+
+	if (data->idx++ < data->s_idx)
+		return 0;
+
+	rc = ieee802154_nl_fill_phy(data->skb,
+			NETLINK_CB(data->cb->skb).pid,
+			data->cb->nlh->nlmsg_seq,
+			NLM_F_MULTI,
+			phy);
+
+	if (rc < 0) {
+		data->idx--;
+		return rc;
+	}
+
+	return 0;
+}
+
+static int ieee802154_dump_phy(struct sk_buff *skb,
+	struct netlink_callback *cb)
+{
+	struct dump_phy_data data = {
+		.cb = cb,
+		.skb = skb,
+		.s_idx = cb->args[0],
+		.idx = 0,
+	};
+
+	pr_debug("%s\n", __func__);
+
+	wpan_phy_for_each(ieee802154_dump_phy_iter, &data);
+
+	cb->args[0] = data.idx;
+
+	return skb->len;
+}
+
+static struct genl_ops ieee802154_phy_ops[] = {
+	IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy,
+							ieee802154_dump_phy),
+};
+
+/*
+ * No need to unregister as family unregistration will do it.
+ */
+int nl802154_phy_register(void)
+{
+	int i;
+	int rc;
+
+	for (i = 0; i < ARRAY_SIZE(ieee802154_phy_ops); i++) {
+		rc = genl_register_ops(&nl802154_family,
+				&ieee802154_phy_ops[i]);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
index 2363ebee02e7..6adda4d46f95 100644
--- a/net/ieee802154/nl_policy.c
+++ b/net/ieee802154/nl_policy.c
@@ -27,6 +27,7 @@
 const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
 	[IEEE802154_ATTR_DEV_NAME] = { .type = NLA_STRING, },
 	[IEEE802154_ATTR_DEV_INDEX] = { .type = NLA_U32, },
+	[IEEE802154_ATTR_PHY_NAME] = { .type = NLA_STRING, },
 
 	[IEEE802154_ATTR_STATUS] = { .type = NLA_U8, },
 	[IEEE802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, },
@@ -50,5 +51,6 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
 	[IEEE802154_ATTR_CHANNELS] = { .type = NLA_U32, },
 	[IEEE802154_ATTR_DURATION] = { .type = NLA_U8, },
 	[IEEE802154_ATTR_ED_LIST] = { .len = 27 },
+	[IEEE802154_ATTR_CHANNEL_PAGE_LIST] = { .len = 32 * 4, },
 };
 
-- 
cgit v1.2.3


From bb1cafb8fc414d6dbe933f888df6540c2ef02101 Mon Sep 17 00:00:00 2001
From: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Date: Thu, 5 Nov 2009 16:56:23 +0300
Subject: ieee802154: add support for creation/removal of logic interfaces

Add support for two more NL802154 commands: ADD_IFACE and DEL_IFACE,
thus allowing creation and removal of logic WPAN interfaces on the top
of wpan-phy.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
---
 include/linux/nl802154.h |   2 +
 include/net/wpan-phy.h   |   4 ++
 net/ieee802154/nl-phy.c  | 156 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 162 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl802154.h b/include/linux/nl802154.h
index 275fd94f4727..33d9f5175109 100644
--- a/include/linux/nl802154.h
+++ b/include/linux/nl802154.h
@@ -118,6 +118,8 @@ enum {
 
 	IEEE802154_LIST_IFACE,
 	IEEE802154_LIST_PHY,
+	IEEE802154_ADD_IFACE,
+	IEEE802154_DEL_IFACE,
 
 	__IEEE802154_CMD_MAX,
 };
diff --git a/include/net/wpan-phy.h b/include/net/wpan-phy.h
index a65e98509fbc..85926231c07a 100644
--- a/include/net/wpan-phy.h
+++ b/include/net/wpan-phy.h
@@ -41,6 +41,10 @@ struct wpan_phy {
 	struct device dev;
 	int idx;
 
+	struct net_device *(*add_iface)(struct wpan_phy *phy,
+			const char *name);
+	void (*del_iface)(struct wpan_phy *phy, struct net_device *dev);
+
 	char priv[0] __attribute__((__aligned__(NETDEV_ALIGN)));
 };
 
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index b7af722eb784..199a2d9d12f9 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -26,6 +26,9 @@
 #include <net/netlink.h>
 #include <net/genetlink.h>
 #include <net/wpan-phy.h>
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/rtnetlink.h> /* for rtnl_{un,}lock */
 #include <linux/nl802154.h>
 
 #include "ieee802154.h"
@@ -164,9 +167,162 @@ static int ieee802154_dump_phy(struct sk_buff *skb,
 	return skb->len;
 }
 
+static int ieee802154_add_iface(struct sk_buff *skb,
+		struct genl_info *info)
+{
+	struct sk_buff *msg;
+	struct wpan_phy *phy;
+	const char *name;
+	const char *devname;
+	int rc = -ENOBUFS;
+	struct net_device *dev;
+
+	pr_debug("%s\n", __func__);
+
+	if (!info->attrs[IEEE802154_ATTR_PHY_NAME])
+		return -EINVAL;
+
+	name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+	if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0')
+		return -EINVAL; /* phy name should be null-terminated */
+
+	if (info->attrs[IEEE802154_ATTR_DEV_NAME]) {
+		devname = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]);
+		if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
+				!= '\0')
+			return -EINVAL; /* phy name should be null-terminated */
+	} else  {
+		devname = "wpan%d";
+	}
+
+	if (strlen(devname) >= IFNAMSIZ)
+		return -ENAMETOOLONG;
+
+	phy = wpan_phy_find(name);
+	if (!phy)
+		return -ENODEV;
+
+	msg = ieee802154_nl_new_reply(info, 0, IEEE802154_ADD_IFACE);
+	if (!msg)
+		goto out_dev;
+
+	if (!phy->add_iface) {
+		rc = -EINVAL;
+		goto nla_put_failure;
+	}
+
+	dev = phy->add_iface(phy, devname);
+	if (IS_ERR(dev)) {
+		rc = PTR_ERR(dev);
+		goto nla_put_failure;
+	}
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy));
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+
+	dev_put(dev);
+
+	wpan_phy_put(phy);
+
+	return ieee802154_nl_reply(msg, info);
+
+nla_put_failure:
+	nlmsg_free(msg);
+out_dev:
+	wpan_phy_put(phy);
+	return rc;
+}
+
+static int ieee802154_del_iface(struct sk_buff *skb,
+		struct genl_info *info)
+{
+	struct sk_buff *msg;
+	struct wpan_phy *phy;
+	const char *name;
+	int rc;
+	struct net_device *dev;
+
+	pr_debug("%s\n", __func__);
+
+	if (!info->attrs[IEEE802154_ATTR_DEV_NAME])
+		return -EINVAL;
+
+	name = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]);
+	if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0')
+		return -EINVAL; /* name should be null-terminated */
+
+	dev = dev_get_by_name(genl_info_net(info), name);
+	if (!dev)
+		return -ENODEV;
+
+	phy = ieee802154_mlme_ops(dev)->get_phy(dev);
+	BUG_ON(!phy);
+
+	rc = -EINVAL;
+	/* phy name is optional, but should be checked if it's given */
+	if (info->attrs[IEEE802154_ATTR_PHY_NAME]) {
+		struct wpan_phy *phy2;
+
+		const char *pname =
+			nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+		if (pname[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1]
+				!= '\0')
+			/* name should be null-terminated */
+			goto out_dev;
+
+		phy2 = wpan_phy_find(pname);
+		if (!phy2)
+			goto out_dev;
+
+		if (phy != phy2) {
+			wpan_phy_put(phy2);
+			goto out_dev;
+		}
+	}
+
+	rc = -ENOBUFS;
+
+	msg = ieee802154_nl_new_reply(info, 0, IEEE802154_DEL_IFACE);
+	if (!msg)
+		goto out_dev;
+
+	if (!phy->del_iface) {
+		rc = -EINVAL;
+		goto nla_put_failure;
+	}
+
+	rtnl_lock();
+	phy->del_iface(phy, dev);
+
+	/* We don't have device anymore */
+	dev_put(dev);
+	dev = NULL;
+
+	rtnl_unlock();
+
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy));
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, name);
+
+	wpan_phy_put(phy);
+
+	return ieee802154_nl_reply(msg, info);
+
+nla_put_failure:
+	nlmsg_free(msg);
+out_dev:
+	wpan_phy_put(phy);
+	if (dev)
+		dev_put(dev);
+
+	return rc;
+}
+
 static struct genl_ops ieee802154_phy_ops[] = {
 	IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy,
 							ieee802154_dump_phy),
+	IEEE802154_OP(IEEE802154_ADD_IFACE, ieee802154_add_iface),
+	IEEE802154_OP(IEEE802154_DEL_IFACE, ieee802154_del_iface),
 };
 
 /*
-- 
cgit v1.2.3


From 642c6d946b5cdc27d0146c41dc20b7c4d4c3ccd8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 01:08:48 -0700
Subject: sysctl: Make do_sysctl static

Now that all of the architectures use compat_sys_sysctl do_sysctl
can become static.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/sysctl.h | 4 ----
 kernel/sysctl_binary.c | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1e4743ee6831..82c32b89932d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -996,10 +996,6 @@ extern int proc_doulongvec_minmax(struct ctl_table *, int,
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 				      void __user *, size_t *, loff_t *);
 
-extern int do_sysctl (int __user *name, int nlen,
-		      void __user *oldval, size_t __user *oldlenp,
-		      void __user *newval, size_t newlen);
-
 extern ctl_handler sysctl_data;
 extern ctl_handler sysctl_string;
 extern ctl_handler sysctl_intvec;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 775cc49da622..642019894299 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -135,7 +135,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
 	return;
 }
 
-int do_sysctl(int __user *args_name, int nlen,
+static int do_sysctl(int __user *args_name, int nlen,
 	void __user *oldval, size_t __user *oldlenp,
 	void __user *newval, size_t newlen)
 {
-- 
cgit v1.2.3


From 2dceba14ef0e62738d58777a1bd4018130d47a74 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 6 Nov 2009 08:09:03 +0000
Subject: compat: add struct compat_ifreq etc to compat.h

In order to move socket ioctl conversion code into multiple
places in the socket code, we need a common defintion of
the data structures it uses.

Also change the name from ifreq32 to compat_ifreq to
follow the naming convention for compat.h

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/compat.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index af931ee43dd8..8311d2e29632 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -10,6 +10,8 @@
 #include <linux/stat.h>
 #include <linux/param.h>	/* for HZ */
 #include <linux/sem.h>
+#include <linux/socket.h>
+#include <linux/if.h>
 
 #include <asm/compat.h>
 #include <asm/siginfo.h>
@@ -154,6 +156,43 @@ typedef struct compat_sigevent {
 	} _sigev_un;
 } compat_sigevent_t;
 
+struct compat_ifmap {
+	compat_ulong_t mem_start;
+	compat_ulong_t mem_end;
+	unsigned short base_addr;
+	unsigned char irq;
+	unsigned char dma;
+	unsigned char port;
+};
+
+struct compat_ifreq {
+#define IFHWADDRLEN     6
+#define IFNAMSIZ        16
+        union {
+                char    ifrn_name[IFNAMSIZ];            /* if name, e.g. "en0" */
+        } ifr_ifrn;
+        union {
+                struct  sockaddr ifru_addr;
+                struct  sockaddr ifru_dstaddr;
+                struct  sockaddr ifru_broadaddr;
+                struct  sockaddr ifru_netmask;
+                struct  sockaddr ifru_hwaddr;
+                short   ifru_flags;
+                compat_int_t     ifru_ivalue;
+                compat_int_t     ifru_mtu;
+                struct  compat_ifmap ifru_map;
+                char    ifru_slave[IFNAMSIZ];   /* Just fits the size */
+		char	ifru_newname[IFNAMSIZ];
+                compat_caddr_t ifru_data;
+	    /* XXXX? ifru_settings should be here */
+        } ifr_ifru;
+};
+
+struct compat_ifconf {
+        compat_int_t	ifc_len;                        /* size of buffer       */
+        compat_caddr_t  ifcbuf;
+};
+
 struct compat_robust_list {
 	compat_uptr_t			next;
 };
-- 
cgit v1.2.3


From b622d97a63ad4ce890b625c62acd1bb894592e63 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 6 Nov 2009 20:46:52 -0800
Subject: net: compat: No need to define IFHWADDRLEN and IFNAMSIZ twice.

It's defined colloqually in linux/if.h and linux/compat.h
includes that.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/compat.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 8311d2e29632..224c7a896172 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -166,8 +166,6 @@ struct compat_ifmap {
 };
 
 struct compat_ifreq {
-#define IFHWADDRLEN     6
-#define IFNAMSIZ        16
         union {
                 char    ifrn_name[IFNAMSIZ];            /* if name, e.g. "en0" */
         } ifr_ifrn;
-- 
cgit v1.2.3


From b215c57dcc847b15693899d26aa0ee4669dacefb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 5 Nov 2009 04:37:30 +0000
Subject: net: kill proto_ops wrapper

All users of wrapped proto_ops are now gone, so we can safely remove
the wrappers as well.

Cc: David S. Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h | 83 -----------------------------------------------------
 1 file changed, 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 70ee3c310f15..6ce87663551c 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -268,89 +268,6 @@ extern int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg);
 extern int kernel_sock_shutdown(struct socket *sock,
 				enum sock_shutdown_cmd how);
 
-#ifndef CONFIG_SMP
-#define SOCKOPS_WRAPPED(name) name
-#define SOCKOPS_WRAP(name, fam)
-#else
-
-#define SOCKOPS_WRAPPED(name) __unlocked_##name
-
-#define SOCKCALL_WRAP(name, call, parms, args)		\
-static int __lock_##name##_##call  parms		\
-{							\
-	int ret;					\
-	lock_kernel();					\
-	ret = __unlocked_##name##_ops.call  args ;\
-	unlock_kernel();				\
-	return ret;					\
-}
-
-#define SOCKCALL_UWRAP(name, call, parms, args)		\
-static unsigned int __lock_##name##_##call  parms	\
-{							\
-	int ret;					\
-	lock_kernel();					\
-	ret = __unlocked_##name##_ops.call  args ;\
-	unlock_kernel();				\
-	return ret;					\
-}
-
-
-#define SOCKOPS_WRAP(name, fam)					\
-SOCKCALL_WRAP(name, release, (struct socket *sock), (sock))	\
-SOCKCALL_WRAP(name, bind, (struct socket *sock, struct sockaddr *uaddr, int addr_len), \
-	      (sock, uaddr, addr_len))				\
-SOCKCALL_WRAP(name, connect, (struct socket *sock, struct sockaddr * uaddr, \
-			      int addr_len, int flags), 	\
-	      (sock, uaddr, addr_len, flags))			\
-SOCKCALL_WRAP(name, socketpair, (struct socket *sock1, struct socket *sock2), \
-	      (sock1, sock2))					\
-SOCKCALL_WRAP(name, accept, (struct socket *sock, struct socket *newsock, \
-			 int flags), (sock, newsock, flags)) \
-SOCKCALL_WRAP(name, getname, (struct socket *sock, struct sockaddr *uaddr, \
-			 int *addr_len, int peer), (sock, uaddr, addr_len, peer)) \
-SOCKCALL_UWRAP(name, poll, (struct file *file, struct socket *sock, struct poll_table_struct *wait), \
-	      (file, sock, wait)) \
-SOCKCALL_WRAP(name, ioctl, (struct socket *sock, unsigned int cmd, \
-			 unsigned long arg), (sock, cmd, arg)) \
-SOCKCALL_WRAP(name, compat_ioctl, (struct socket *sock, unsigned int cmd, \
-			 unsigned long arg), (sock, cmd, arg)) \
-SOCKCALL_WRAP(name, listen, (struct socket *sock, int len), (sock, len)) \
-SOCKCALL_WRAP(name, shutdown, (struct socket *sock, int flags), (sock, flags)) \
-SOCKCALL_WRAP(name, setsockopt, (struct socket *sock, int level, int optname, \
-			 char __user *optval, unsigned int optlen), (sock, level, optname, optval, optlen)) \
-SOCKCALL_WRAP(name, getsockopt, (struct socket *sock, int level, int optname, \
-			 char __user *optval, int __user *optlen), (sock, level, optname, optval, optlen)) \
-SOCKCALL_WRAP(name, sendmsg, (struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t len), \
-	      (iocb, sock, m, len)) \
-SOCKCALL_WRAP(name, recvmsg, (struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t len, int flags), \
-	      (iocb, sock, m, len, flags)) \
-SOCKCALL_WRAP(name, mmap, (struct file *file, struct socket *sock, struct vm_area_struct *vma), \
-	      (file, sock, vma)) \
-	      \
-static const struct proto_ops name##_ops = {			\
-	.family		= fam,				\
-	.owner		= THIS_MODULE,			\
-	.release	= __lock_##name##_release,	\
-	.bind		= __lock_##name##_bind,		\
-	.connect	= __lock_##name##_connect,	\
-	.socketpair	= __lock_##name##_socketpair,	\
-	.accept		= __lock_##name##_accept,	\
-	.getname	= __lock_##name##_getname,	\
-	.poll		= __lock_##name##_poll,		\
-	.ioctl		= __lock_##name##_ioctl,	\
-	.compat_ioctl	= __lock_##name##_compat_ioctl,	\
-	.listen		= __lock_##name##_listen,	\
-	.shutdown	= __lock_##name##_shutdown,	\
-	.setsockopt	= __lock_##name##_setsockopt,	\
-	.getsockopt	= __lock_##name##_getsockopt,	\
-	.sendmsg	= __lock_##name##_sendmsg,	\
-	.recvmsg	= __lock_##name##_recvmsg,	\
-	.mmap		= __lock_##name##_mmap,		\
-};
-
-#endif
-
 #define MODULE_ALIAS_NETPROTO(proto) \
 	MODULE_ALIAS("net-pf-" __stringify(proto))
 
-- 
cgit v1.2.3


From 444a2a3bcd6d5bed5c823136f68fcc93c0fe283f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 6 Nov 2009 04:13:05 +0100
Subject: tracing, perf_events: Protect the buffer from recursion in perf

While tracing using events with perf, if one enables the
lockdep:lock_acquire event, it will infect every other perf
trace events.

Basically, you can enable whatever set of trace events through
perf but if this event is part of the set, the only result we
can get is a long list of lock_acquire events of rcu read lock,
and only that.

This is because of a recursion inside perf.

1) When a trace event is triggered, it will fill a per cpu
   buffer and submit it to perf.

2) Perf will commit this event but will also protect some data
   using rcu_read_lock

3) A recursion appears: rcu_read_lock triggers a lock_acquire
   event that will fill the per cpu event and then submit the
   buffer to perf.

4) Perf detects a recursion and ignores it

5) Perf continues its work on the previous event, but its buffer
   has been overwritten by the lock_acquire event, it has then
   been turned into a lock_acquire event of rcu read lock

Such scenario also happens with lock_release with
rcu_read_unlock().

We could turn the rcu_read_lock() into __rcu_read_lock() to drop
the lock debugging from perf fast path, but that would make us
lose the rcu debugging and that doesn't prevent from other
possible kind of recursion from perf in the future.

This patch adds a recursion protection based on a counter on the
perf trace per cpu buffers to solve the problem.

-v2: Fixed lost whitespace, added reviewed-by tag

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jason Baron <jbaron@redhat.com>
LKML-Reference: <1257477185-7838-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |  9 +++++--
 include/trace/ftrace.h             | 39 ++++++++++++++++++++++-------
 kernel/trace/trace_event_profile.c | 41 ++++++++++++++-----------------
 kernel/trace/trace_kprobe.c        | 50 ++++++++++++++++++++++++++++++++------
 kernel/trace/trace_syscalls.c      | 44 +++++++++++++++++++++++++++------
 5 files changed, 133 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index f7b47c336703..43360c1d8f70 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -137,8 +137,13 @@ struct ftrace_event_call {
 
 #define FTRACE_MAX_PROFILE_SIZE	2048
 
-extern char			*trace_profile_buf;
-extern char			*trace_profile_buf_nmi;
+struct perf_trace_buf {
+	char	buf[FTRACE_MAX_PROFILE_SIZE];
+	int	recursion;
+};
+
+extern struct perf_trace_buf	*perf_trace_buf;
+extern struct perf_trace_buf	*perf_trace_buf_nmi;
 
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a7f946094128..4945d1c99864 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -649,6 +649,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *	struct ftrace_event_call *event_call = &event_<call>;
  *	extern void perf_tp_event(int, u64, u64, void *, int);
  *	struct ftrace_raw_##call *entry;
+ *	struct perf_trace_buf *trace_buf;
  *	u64 __addr = 0, __count = 1;
  *	unsigned long irq_flags;
  *	struct trace_entry *ent;
@@ -673,14 +674,25 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
  *	__cpu = smp_processor_id();
  *
  *	if (in_nmi())
- *		raw_data = rcu_dereference(trace_profile_buf_nmi);
+ *		trace_buf = rcu_dereference(perf_trace_buf_nmi);
  *	else
- *		raw_data = rcu_dereference(trace_profile_buf);
+ *		trace_buf = rcu_dereference(perf_trace_buf);
  *
- *	if (!raw_data)
+ *	if (!trace_buf)
  *		goto end;
  *
- *	raw_data = per_cpu_ptr(raw_data, __cpu);
+ *	trace_buf = per_cpu_ptr(trace_buf, __cpu);
+ *
+ * 	// Avoid recursion from perf that could mess up the buffer
+ * 	if (trace_buf->recursion++)
+ *		goto end_recursion;
+ *
+ * 	raw_data = trace_buf->buf;
+ *
+ *	// Make recursion update visible before entering perf_tp_event
+ *	// so that we protect from perf recursions.
+ *
+ *	barrier();
  *
  *	//zero dead bytes from alignment to avoid stack leak to userspace:
  *	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
@@ -713,8 +725,9 @@ static void ftrace_profile_##call(proto)				\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_event_call *event_call = &event_##call;		\
-	extern void perf_tp_event(int, u64, u64, void *, int);	\
+	extern void perf_tp_event(int, u64, u64, void *, int);		\
 	struct ftrace_raw_##call *entry;				\
+	struct perf_trace_buf *trace_buf;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
 	struct trace_entry *ent;					\
@@ -739,14 +752,20 @@ static void ftrace_profile_##call(proto)				\
 	__cpu = smp_processor_id();					\
 									\
 	if (in_nmi())							\
-		raw_data = rcu_dereference(trace_profile_buf_nmi);		\
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);	\
 	else								\
-		raw_data = rcu_dereference(trace_profile_buf);		\
+		trace_buf = rcu_dereference(perf_trace_buf);		\
 									\
-	if (!raw_data)							\
+	if (!trace_buf)							\
 		goto end;						\
 									\
-	raw_data = per_cpu_ptr(raw_data, __cpu);			\
+	trace_buf = per_cpu_ptr(trace_buf, __cpu);			\
+	if (trace_buf->recursion++)					\
+		goto end_recursion;					\
+									\
+	barrier();							\
+									\
+	raw_data = trace_buf->buf;					\
 									\
 	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;		\
 	entry = (struct ftrace_raw_##call *)raw_data;			\
@@ -761,6 +780,8 @@ static void ftrace_profile_##call(proto)				\
 	perf_tp_event(event_call->id, __addr, __count, entry,		\
 			     __entry_size);				\
 									\
+end_recursion:								\
+	trace_buf->recursion--;						\
 end:									\
 	local_irq_restore(irq_flags);					\
 									\
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index c9f687ab0d4f..e0d351b01f5a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,41 +8,36 @@
 #include <linux/module.h>
 #include "trace.h"
 
-/*
- * We can't use a size but a type in alloc_percpu()
- * So let's create a dummy type that matches the desired size
- */
-typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
 
-char		*trace_profile_buf;
-EXPORT_SYMBOL_GPL(trace_profile_buf);
+struct perf_trace_buf *perf_trace_buf;
+EXPORT_SYMBOL_GPL(perf_trace_buf);
 
-char		*trace_profile_buf_nmi;
-EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
+struct perf_trace_buf *perf_trace_buf_nmi;
+EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
 
 /* Count the events in use (per event id, not per instance) */
 static int	total_profile_count;
 
 static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 {
-	char *buf;
+	struct perf_trace_buf *buf;
 	int ret = -ENOMEM;
 
 	if (atomic_inc_return(&event->profile_count))
 		return 0;
 
 	if (!total_profile_count) {
-		buf = (char *)alloc_percpu(profile_buf_t);
+		buf = alloc_percpu(struct perf_trace_buf);
 		if (!buf)
 			goto fail_buf;
 
-		rcu_assign_pointer(trace_profile_buf, buf);
+		rcu_assign_pointer(perf_trace_buf, buf);
 
-		buf = (char *)alloc_percpu(profile_buf_t);
+		buf = alloc_percpu(struct perf_trace_buf);
 		if (!buf)
 			goto fail_buf_nmi;
 
-		rcu_assign_pointer(trace_profile_buf_nmi, buf);
+		rcu_assign_pointer(perf_trace_buf_nmi, buf);
 	}
 
 	ret = event->profile_enable(event);
@@ -53,10 +48,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 
 fail_buf_nmi:
 	if (!total_profile_count) {
-		free_percpu(trace_profile_buf_nmi);
-		free_percpu(trace_profile_buf);
-		trace_profile_buf_nmi = NULL;
-		trace_profile_buf = NULL;
+		free_percpu(perf_trace_buf_nmi);
+		free_percpu(perf_trace_buf);
+		perf_trace_buf_nmi = NULL;
+		perf_trace_buf = NULL;
 	}
 fail_buf:
 	atomic_dec(&event->profile_count);
@@ -84,7 +79,7 @@ int ftrace_profile_enable(int event_id)
 
 static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 {
-	char *buf, *nmi_buf;
+	struct perf_trace_buf *buf, *nmi_buf;
 
 	if (!atomic_add_negative(-1, &event->profile_count))
 		return;
@@ -92,11 +87,11 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 	event->profile_disable(event);
 
 	if (!--total_profile_count) {
-		buf = trace_profile_buf;
-		rcu_assign_pointer(trace_profile_buf, NULL);
+		buf = perf_trace_buf;
+		rcu_assign_pointer(perf_trace_buf, NULL);
 
-		nmi_buf = trace_profile_buf_nmi;
-		rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+		nmi_buf = perf_trace_buf_nmi;
+		rcu_assign_pointer(perf_trace_buf_nmi, NULL);
 
 		/*
 		 * Ensure every events in profiling have finished before
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index cf17a6694f32..3696476f307d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1208,6 +1208,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry *entry;
+	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
@@ -1229,14 +1230,26 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
-		raw_data = rcu_dereference(trace_profile_buf_nmi);
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
 	else
-		raw_data = rcu_dereference(trace_profile_buf);
+		trace_buf = rcu_dereference(perf_trace_buf);
 
-	if (!raw_data)
+	if (!trace_buf)
 		goto end;
 
-	raw_data = per_cpu_ptr(raw_data, __cpu);
+	trace_buf = per_cpu_ptr(trace_buf, __cpu);
+
+	if (trace_buf->recursion++)
+		goto end_recursion;
+
+	/*
+	 * Make recursion update visible before entering perf_tp_event
+	 * so that we protect from perf recursions.
+	 */
+	barrier();
+
+	raw_data = trace_buf->buf;
+
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
 	entry = (struct kprobe_trace_entry *)raw_data;
@@ -1249,8 +1262,12 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ip, 1, entry, size);
+
+end_recursion:
+	trace_buf->recursion--;
 end:
 	local_irq_restore(irq_flags);
+
 	return 0;
 }
 
@@ -1261,6 +1278,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry *entry;
+	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
@@ -1282,14 +1300,26 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
-		raw_data = rcu_dereference(trace_profile_buf_nmi);
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
 	else
-		raw_data = rcu_dereference(trace_profile_buf);
+		trace_buf = rcu_dereference(perf_trace_buf);
 
-	if (!raw_data)
+	if (!trace_buf)
 		goto end;
 
-	raw_data = per_cpu_ptr(raw_data, __cpu);
+	trace_buf = per_cpu_ptr(trace_buf, __cpu);
+
+	if (trace_buf->recursion++)
+		goto end_recursion;
+
+	/*
+	 * Make recursion update visible before entering perf_tp_event
+	 * so that we protect from perf recursions.
+	 */
+	barrier();
+
+	raw_data = trace_buf->buf;
+
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
 	entry = (struct kretprobe_trace_entry *)raw_data;
@@ -1303,8 +1333,12 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
+
+end_recursion:
+	trace_buf->recursion--;
 end:
 	local_irq_restore(irq_flags);
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 58b8e5370767..51213b0aa81b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -477,6 +477,7 @@ static int sys_prof_refcount_exit;
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
+	struct perf_trace_buf *trace_buf;
 	struct syscall_trace_enter *rec;
 	unsigned long flags;
 	char *raw_data;
@@ -507,14 +508,25 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	cpu = smp_processor_id();
 
 	if (in_nmi())
-		raw_data = rcu_dereference(trace_profile_buf_nmi);
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
 	else
-		raw_data = rcu_dereference(trace_profile_buf);
+		trace_buf = rcu_dereference(perf_trace_buf);
 
-	if (!raw_data)
+	if (!trace_buf)
 		goto end;
 
-	raw_data = per_cpu_ptr(raw_data, cpu);
+	trace_buf = per_cpu_ptr(trace_buf, cpu);
+
+	if (trace_buf->recursion++)
+		goto end_recursion;
+
+	/*
+	 * Make recursion update visible before entering perf_tp_event
+	 * so that we protect from perf recursions.
+	 */
+	barrier();
+
+	raw_data = trace_buf->buf;
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -527,6 +539,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 			       (unsigned long *)&rec->args);
 	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
 
+end_recursion:
+	trace_buf->recursion--;
 end:
 	local_irq_restore(flags);
 }
@@ -574,6 +588,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
+	struct perf_trace_buf *trace_buf;
 	unsigned long flags;
 	int syscall_nr;
 	char *raw_data;
@@ -605,14 +620,25 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	cpu = smp_processor_id();
 
 	if (in_nmi())
-		raw_data = rcu_dereference(trace_profile_buf_nmi);
+		trace_buf = rcu_dereference(perf_trace_buf_nmi);
 	else
-		raw_data = rcu_dereference(trace_profile_buf);
+		trace_buf = rcu_dereference(perf_trace_buf);
 
-	if (!raw_data)
+	if (!trace_buf)
 		goto end;
 
-	raw_data = per_cpu_ptr(raw_data, cpu);
+	trace_buf = per_cpu_ptr(trace_buf, cpu);
+
+	if (trace_buf->recursion++)
+		goto end_recursion;
+
+	/*
+	 * Make recursion update visible before entering perf_tp_event
+	 * so that we protect from perf recursions.
+	 */
+	barrier();
+
+	raw_data = trace_buf->buf;
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -626,6 +652,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 
 	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
 
+end_recursion:
+	trace_buf->recursion--;
 end:
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From e0000163e30eeb112b41486ea113fd54f64e1f17 Mon Sep 17 00:00:00 2001
From: Christian Pellegrin <chripell@fsfe.org>
Date: Mon, 2 Nov 2009 23:07:00 +0000
Subject: can: Driver for the Microchip MCP251x SPI CAN controllers

Signed-off-by: Christian Pellegrin <chripell@fsfe.org>
Signed-off-by: Wolfgang Grandegger <wg@grandegger.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/can/Kconfig              |    6 +
 drivers/net/can/Makefile             |    1 +
 drivers/net/can/mcp251x.c            | 1164 ++++++++++++++++++++++++++++++++++
 include/linux/can/platform/mcp251x.h |   36 ++
 4 files changed, 1207 insertions(+)
 create mode 100644 drivers/net/can/mcp251x.c
 create mode 100644 include/linux/can/platform/mcp251x.h

(limited to 'include/linux')

diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig
index 26d77cc0ded7..b819cc2a429e 100644
--- a/drivers/net/can/Kconfig
+++ b/drivers/net/can/Kconfig
@@ -102,6 +102,12 @@ config CAN_TI_HECC
 	  Driver for TI HECC (High End CAN Controller) module found on many
 	  TI devices. The device specifications are available from www.ti.com
 
+config CAN_MCP251X
+	tristate "Microchip MCP251x SPI CAN controllers"
+	depends on CAN_DEV && SPI
+	---help---
+	  Driver for the Microchip MCP251x SPI CAN controllers.
+
 config CAN_DEBUG_DEVICES
 	bool "CAN devices debugging messages"
 	depends on CAN
diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile
index 31f4ab5df28b..14891817ea5b 100644
--- a/drivers/net/can/Makefile
+++ b/drivers/net/can/Makefile
@@ -12,5 +12,6 @@ obj-y				+= usb/
 obj-$(CONFIG_CAN_SJA1000)	+= sja1000/
 obj-$(CONFIG_CAN_AT91)		+= at91_can.o
 obj-$(CONFIG_CAN_TI_HECC)	+= ti_hecc.o
+obj-$(CONFIG_CAN_MCP251X)	+= mcp251x.o
 
 ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG
diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c
new file mode 100644
index 000000000000..8f48f4b50b7c
--- /dev/null
+++ b/drivers/net/can/mcp251x.c
@@ -0,0 +1,1164 @@
+/*
+ * CAN bus driver for Microchip 251x CAN Controller with SPI Interface
+ *
+ * MCP2510 support and bug fixes by Christian Pellegrin
+ * <chripell@evolware.org>
+ *
+ * Copyright 2009 Christian Pellegrin EVOL S.r.l.
+ *
+ * Copyright 2007 Raymarine UK, Ltd. All Rights Reserved.
+ * Written under contract by:
+ *   Chris Elston, Katalix Systems, Ltd.
+ *
+ * Based on Microchip MCP251x CAN controller driver written by
+ * David Vrabel, Copyright 2006 Arcom Control Systems Ltd.
+ *
+ * Based on CAN bus driver for the CCAN controller written by
+ * - Sascha Hauer, Marc Kleine-Budde, Pengutronix
+ * - Simon Kallweit, intefo AG
+ * Copyright 2007
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the version 2 of the GNU General Public License
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ *
+ * Your platform definition file should specify something like:
+ *
+ * static struct mcp251x_platform_data mcp251x_info = {
+ *         .oscillator_frequency = 8000000,
+ *         .board_specific_setup = &mcp251x_setup,
+ *         .model = CAN_MCP251X_MCP2510,
+ *         .power_enable = mcp251x_power_enable,
+ *         .transceiver_enable = NULL,
+ * };
+ *
+ * static struct spi_board_info spi_board_info[] = {
+ *         {
+ *                 .modalias = "mcp251x",
+ *                 .platform_data = &mcp251x_info,
+ *                 .irq = IRQ_EINT13,
+ *                 .max_speed_hz = 2*1000*1000,
+ *                 .chip_select = 2,
+ *         },
+ * };
+ *
+ * Please see mcp251x.h for a description of the fields in
+ * struct mcp251x_platform_data.
+ *
+ */
+
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/dev.h>
+#include <linux/can/platform/mcp251x.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/freezer.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/spi/spi.h>
+#include <linux/uaccess.h>
+
+/* SPI interface instruction set */
+#define INSTRUCTION_WRITE	0x02
+#define INSTRUCTION_READ	0x03
+#define INSTRUCTION_BIT_MODIFY	0x05
+#define INSTRUCTION_LOAD_TXB(n)	(0x40 + 2 * (n))
+#define INSTRUCTION_READ_RXB(n)	(((n) == 0) ? 0x90 : 0x94)
+#define INSTRUCTION_RESET	0xC0
+
+/* MPC251x registers */
+#define CANSTAT	      0x0e
+#define CANCTRL	      0x0f
+#  define CANCTRL_REQOP_MASK	    0xe0
+#  define CANCTRL_REQOP_CONF	    0x80
+#  define CANCTRL_REQOP_LISTEN_ONLY 0x60
+#  define CANCTRL_REQOP_LOOPBACK    0x40
+#  define CANCTRL_REQOP_SLEEP	    0x20
+#  define CANCTRL_REQOP_NORMAL	    0x00
+#  define CANCTRL_OSM		    0x08
+#  define CANCTRL_ABAT		    0x10
+#define TEC	      0x1c
+#define REC	      0x1d
+#define CNF1	      0x2a
+#  define CNF1_SJW_SHIFT   6
+#define CNF2	      0x29
+#  define CNF2_BTLMODE	   0x80
+#  define CNF2_SAM         0x40
+#  define CNF2_PS1_SHIFT   3
+#define CNF3	      0x28
+#  define CNF3_SOF	   0x08
+#  define CNF3_WAKFIL	   0x04
+#  define CNF3_PHSEG2_MASK 0x07
+#define CANINTE	      0x2b
+#  define CANINTE_MERRE 0x80
+#  define CANINTE_WAKIE 0x40
+#  define CANINTE_ERRIE 0x20
+#  define CANINTE_TX2IE 0x10
+#  define CANINTE_TX1IE 0x08
+#  define CANINTE_TX0IE 0x04
+#  define CANINTE_RX1IE 0x02
+#  define CANINTE_RX0IE 0x01
+#define CANINTF	      0x2c
+#  define CANINTF_MERRF 0x80
+#  define CANINTF_WAKIF 0x40
+#  define CANINTF_ERRIF 0x20
+#  define CANINTF_TX2IF 0x10
+#  define CANINTF_TX1IF 0x08
+#  define CANINTF_TX0IF 0x04
+#  define CANINTF_RX1IF 0x02
+#  define CANINTF_RX0IF 0x01
+#define EFLG	      0x2d
+#  define EFLG_EWARN	0x01
+#  define EFLG_RXWAR	0x02
+#  define EFLG_TXWAR	0x04
+#  define EFLG_RXEP	0x08
+#  define EFLG_TXEP	0x10
+#  define EFLG_TXBO	0x20
+#  define EFLG_RX0OVR	0x40
+#  define EFLG_RX1OVR	0x80
+#define TXBCTRL(n)  (((n) * 0x10) + 0x30 + TXBCTRL_OFF)
+#  define TXBCTRL_ABTF	0x40
+#  define TXBCTRL_MLOA	0x20
+#  define TXBCTRL_TXERR 0x10
+#  define TXBCTRL_TXREQ 0x08
+#define TXBSIDH(n)  (((n) * 0x10) + 0x30 + TXBSIDH_OFF)
+#  define SIDH_SHIFT    3
+#define TXBSIDL(n)  (((n) * 0x10) + 0x30 + TXBSIDL_OFF)
+#  define SIDL_SID_MASK    7
+#  define SIDL_SID_SHIFT   5
+#  define SIDL_EXIDE_SHIFT 3
+#  define SIDL_EID_SHIFT   16
+#  define SIDL_EID_MASK    3
+#define TXBEID8(n)  (((n) * 0x10) + 0x30 + TXBEID8_OFF)
+#define TXBEID0(n)  (((n) * 0x10) + 0x30 + TXBEID0_OFF)
+#define TXBDLC(n)   (((n) * 0x10) + 0x30 + TXBDLC_OFF)
+#  define DLC_RTR_SHIFT    6
+#define TXBCTRL_OFF 0
+#define TXBSIDH_OFF 1
+#define TXBSIDL_OFF 2
+#define TXBEID8_OFF 3
+#define TXBEID0_OFF 4
+#define TXBDLC_OFF  5
+#define TXBDAT_OFF  6
+#define RXBCTRL(n)  (((n) * 0x10) + 0x60 + RXBCTRL_OFF)
+#  define RXBCTRL_BUKT	0x04
+#  define RXBCTRL_RXM0	0x20
+#  define RXBCTRL_RXM1	0x40
+#define RXBSIDH(n)  (((n) * 0x10) + 0x60 + RXBSIDH_OFF)
+#  define RXBSIDH_SHIFT 3
+#define RXBSIDL(n)  (((n) * 0x10) + 0x60 + RXBSIDL_OFF)
+#  define RXBSIDL_IDE   0x08
+#  define RXBSIDL_EID   3
+#  define RXBSIDL_SHIFT 5
+#define RXBEID8(n)  (((n) * 0x10) + 0x60 + RXBEID8_OFF)
+#define RXBEID0(n)  (((n) * 0x10) + 0x60 + RXBEID0_OFF)
+#define RXBDLC(n)   (((n) * 0x10) + 0x60 + RXBDLC_OFF)
+#  define RXBDLC_LEN_MASK  0x0f
+#  define RXBDLC_RTR       0x40
+#define RXBCTRL_OFF 0
+#define RXBSIDH_OFF 1
+#define RXBSIDL_OFF 2
+#define RXBEID8_OFF 3
+#define RXBEID0_OFF 4
+#define RXBDLC_OFF  5
+#define RXBDAT_OFF  6
+
+#define GET_BYTE(val, byte)			\
+	(((val) >> ((byte) * 8)) & 0xff)
+#define SET_BYTE(val, byte)			\
+	(((val) & 0xff) << ((byte) * 8))
+
+/*
+ * Buffer size required for the largest SPI transfer (i.e., reading a
+ * frame)
+ */
+#define CAN_FRAME_MAX_DATA_LEN	8
+#define SPI_TRANSFER_BUF_LEN	(6 + CAN_FRAME_MAX_DATA_LEN)
+#define CAN_FRAME_MAX_BITS	128
+
+#define TX_ECHO_SKB_MAX	1
+
+#define DEVICE_NAME "mcp251x"
+
+static int mcp251x_enable_dma; /* Enable SPI DMA. Default: 0 (Off) */
+module_param(mcp251x_enable_dma, int, S_IRUGO);
+MODULE_PARM_DESC(mcp251x_enable_dma, "Enable SPI DMA. Default: 0 (Off)");
+
+static struct can_bittiming_const mcp251x_bittiming_const = {
+	.name = DEVICE_NAME,
+	.tseg1_min = 3,
+	.tseg1_max = 16,
+	.tseg2_min = 2,
+	.tseg2_max = 8,
+	.sjw_max = 4,
+	.brp_min = 1,
+	.brp_max = 64,
+	.brp_inc = 1,
+};
+
+struct mcp251x_priv {
+	struct can_priv	   can;
+	struct net_device *net;
+	struct spi_device *spi;
+
+	struct mutex spi_lock; /* SPI buffer lock */
+	u8 *spi_tx_buf;
+	u8 *spi_rx_buf;
+	dma_addr_t spi_tx_dma;
+	dma_addr_t spi_rx_dma;
+
+	struct sk_buff *tx_skb;
+	int tx_len;
+	struct workqueue_struct *wq;
+	struct work_struct tx_work;
+	struct work_struct irq_work;
+	struct completion awake;
+	int wake;
+	int force_quit;
+	int after_suspend;
+#define AFTER_SUSPEND_UP 1
+#define AFTER_SUSPEND_DOWN 2
+#define AFTER_SUSPEND_POWER 4
+#define AFTER_SUSPEND_RESTART 8
+	int restart_tx;
+};
+
+static void mcp251x_clean(struct net_device *net)
+{
+	struct mcp251x_priv *priv = netdev_priv(net);
+
+	net->stats.tx_errors++;
+	if (priv->tx_skb)
+		dev_kfree_skb(priv->tx_skb);
+	if (priv->tx_len)
+		can_free_echo_skb(priv->net, 0);
+	priv->tx_skb = NULL;
+	priv->tx_len = 0;
+}
+
+/*
+ * Note about handling of error return of mcp251x_spi_trans: accessing
+ * registers via SPI is not really different conceptually than using
+ * normal I/O assembler instructions, although it's much more
+ * complicated from a practical POV. So it's not advisable to always
+ * check the return value of this function. Imagine that every
+ * read{b,l}, write{b,l} and friends would be bracketed in "if ( < 0)
+ * error();", it would be a great mess (well there are some situation
+ * when exception handling C++ like could be useful after all). So we
+ * just check that transfers are OK at the beginning of our
+ * conversation with the chip and to avoid doing really nasty things
+ * (like injecting bogus packets in the network stack).
+ */
+static int mcp251x_spi_trans(struct spi_device *spi, int len)
+{
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+	struct spi_transfer t = {
+		.tx_buf = priv->spi_tx_buf,
+		.rx_buf = priv->spi_rx_buf,
+		.len = len,
+		.cs_change = 0,
+	};
+	struct spi_message m;
+	int ret;
+
+	spi_message_init(&m);
+
+	if (mcp251x_enable_dma) {
+		t.tx_dma = priv->spi_tx_dma;
+		t.rx_dma = priv->spi_rx_dma;
+		m.is_dma_mapped = 1;
+	}
+
+	spi_message_add_tail(&t, &m);
+
+	ret = spi_sync(spi, &m);
+	if (ret)
+		dev_err(&spi->dev, "spi transfer failed: ret = %d\n", ret);
+	return ret;
+}
+
+static u8 mcp251x_read_reg(struct spi_device *spi, uint8_t reg)
+{
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+	u8 val = 0;
+
+	mutex_lock(&priv->spi_lock);
+
+	priv->spi_tx_buf[0] = INSTRUCTION_READ;
+	priv->spi_tx_buf[1] = reg;
+
+	mcp251x_spi_trans(spi, 3);
+	val = priv->spi_rx_buf[2];
+
+	mutex_unlock(&priv->spi_lock);
+
+	return val;
+}
+
+static void mcp251x_write_reg(struct spi_device *spi, u8 reg, uint8_t val)
+{
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+
+	mutex_lock(&priv->spi_lock);
+
+	priv->spi_tx_buf[0] = INSTRUCTION_WRITE;
+	priv->spi_tx_buf[1] = reg;
+	priv->spi_tx_buf[2] = val;
+
+	mcp251x_spi_trans(spi, 3);
+
+	mutex_unlock(&priv->spi_lock);
+}
+
+static void mcp251x_write_bits(struct spi_device *spi, u8 reg,
+			       u8 mask, uint8_t val)
+{
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+
+	mutex_lock(&priv->spi_lock);
+
+	priv->spi_tx_buf[0] = INSTRUCTION_BIT_MODIFY;
+	priv->spi_tx_buf[1] = reg;
+	priv->spi_tx_buf[2] = mask;
+	priv->spi_tx_buf[3] = val;
+
+	mcp251x_spi_trans(spi, 4);
+
+	mutex_unlock(&priv->spi_lock);
+}
+
+static void mcp251x_hw_tx_frame(struct spi_device *spi, u8 *buf,
+				int len, int tx_buf_idx)
+{
+	struct mcp251x_platform_data *pdata = spi->dev.platform_data;
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+
+	if (pdata->model == CAN_MCP251X_MCP2510) {
+		int i;
+
+		for (i = 1; i < TXBDAT_OFF + len; i++)
+			mcp251x_write_reg(spi, TXBCTRL(tx_buf_idx) + i,
+					  buf[i]);
+	} else {
+		mutex_lock(&priv->spi_lock);
+		memcpy(priv->spi_tx_buf, buf, TXBDAT_OFF + len);
+		mcp251x_spi_trans(spi, TXBDAT_OFF + len);
+		mutex_unlock(&priv->spi_lock);
+	}
+}
+
+static void mcp251x_hw_tx(struct spi_device *spi, struct can_frame *frame,
+			  int tx_buf_idx)
+{
+	u32 sid, eid, exide, rtr;
+	u8 buf[SPI_TRANSFER_BUF_LEN];
+
+	exide = (frame->can_id & CAN_EFF_FLAG) ? 1 : 0; /* Extended ID Enable */
+	if (exide)
+		sid = (frame->can_id & CAN_EFF_MASK) >> 18;
+	else
+		sid = frame->can_id & CAN_SFF_MASK; /* Standard ID */
+	eid = frame->can_id & CAN_EFF_MASK; /* Extended ID */
+	rtr = (frame->can_id & CAN_RTR_FLAG) ? 1 : 0; /* Remote transmission */
+
+	buf[TXBCTRL_OFF] = INSTRUCTION_LOAD_TXB(tx_buf_idx);
+	buf[TXBSIDH_OFF] = sid >> SIDH_SHIFT;
+	buf[TXBSIDL_OFF] = ((sid & SIDL_SID_MASK) << SIDL_SID_SHIFT) |
+		(exide << SIDL_EXIDE_SHIFT) |
+		((eid >> SIDL_EID_SHIFT) & SIDL_EID_MASK);
+	buf[TXBEID8_OFF] = GET_BYTE(eid, 1);
+	buf[TXBEID0_OFF] = GET_BYTE(eid, 0);
+	buf[TXBDLC_OFF] = (rtr << DLC_RTR_SHIFT) | frame->can_dlc;
+	memcpy(buf + TXBDAT_OFF, frame->data, frame->can_dlc);
+	mcp251x_hw_tx_frame(spi, buf, frame->can_dlc, tx_buf_idx);
+	mcp251x_write_reg(spi, TXBCTRL(tx_buf_idx), TXBCTRL_TXREQ);
+}
+
+static void mcp251x_hw_rx_frame(struct spi_device *spi, u8 *buf,
+				int buf_idx)
+{
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+	struct mcp251x_platform_data *pdata = spi->dev.platform_data;
+
+	if (pdata->model == CAN_MCP251X_MCP2510) {
+		int i, len;
+
+		for (i = 1; i < RXBDAT_OFF; i++)
+			buf[i] = mcp251x_read_reg(spi, RXBCTRL(buf_idx) + i);
+		len = buf[RXBDLC_OFF] & RXBDLC_LEN_MASK;
+		if (len > 8)
+			len = 8;
+		for (; i < (RXBDAT_OFF + len); i++)
+			buf[i] = mcp251x_read_reg(spi, RXBCTRL(buf_idx) + i);
+	} else {
+		mutex_lock(&priv->spi_lock);
+
+		priv->spi_tx_buf[RXBCTRL_OFF] = INSTRUCTION_READ_RXB(buf_idx);
+		mcp251x_spi_trans(spi, SPI_TRANSFER_BUF_LEN);
+		memcpy(buf, priv->spi_rx_buf, SPI_TRANSFER_BUF_LEN);
+
+		mutex_unlock(&priv->spi_lock);
+	}
+}
+
+static void mcp251x_hw_rx(struct spi_device *spi, int buf_idx)
+{
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+	struct sk_buff *skb;
+	struct can_frame *frame;
+	u8 buf[SPI_TRANSFER_BUF_LEN];
+
+	skb = alloc_can_skb(priv->net, &frame);
+	if (!skb) {
+		dev_err(&spi->dev, "cannot allocate RX skb\n");
+		priv->net->stats.rx_dropped++;
+		return;
+	}
+
+	mcp251x_hw_rx_frame(spi, buf, buf_idx);
+	if (buf[RXBSIDL_OFF] & RXBSIDL_IDE) {
+		/* Extended ID format */
+		frame->can_id = CAN_EFF_FLAG;
+		frame->can_id |=
+			/* Extended ID part */
+			SET_BYTE(buf[RXBSIDL_OFF] & RXBSIDL_EID, 2) |
+			SET_BYTE(buf[RXBEID8_OFF], 1) |
+			SET_BYTE(buf[RXBEID0_OFF], 0) |
+			/* Standard ID part */
+			(((buf[RXBSIDH_OFF] << RXBSIDH_SHIFT) |
+			  (buf[RXBSIDL_OFF] >> RXBSIDL_SHIFT)) << 18);
+		/* Remote transmission request */
+		if (buf[RXBDLC_OFF] & RXBDLC_RTR)
+			frame->can_id |= CAN_RTR_FLAG;
+	} else {
+		/* Standard ID format */
+		frame->can_id =
+			(buf[RXBSIDH_OFF] << RXBSIDH_SHIFT) |
+			(buf[RXBSIDL_OFF] >> RXBSIDL_SHIFT);
+	}
+	/* Data length */
+	frame->can_dlc = buf[RXBDLC_OFF] & RXBDLC_LEN_MASK;
+	if (frame->can_dlc > 8) {
+		dev_warn(&spi->dev, "invalid frame recevied\n");
+		priv->net->stats.rx_errors++;
+		dev_kfree_skb(skb);
+		return;
+	}
+	memcpy(frame->data, buf + RXBDAT_OFF, frame->can_dlc);
+
+	priv->net->stats.rx_packets++;
+	priv->net->stats.rx_bytes += frame->can_dlc;
+	netif_rx(skb);
+}
+
+static void mcp251x_hw_sleep(struct spi_device *spi)
+{
+	mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_SLEEP);
+}
+
+static void mcp251x_hw_wakeup(struct spi_device *spi)
+{
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+
+	priv->wake = 1;
+
+	/* Can only wake up by generating a wake-up interrupt. */
+	mcp251x_write_bits(spi, CANINTE, CANINTE_WAKIE, CANINTE_WAKIE);
+	mcp251x_write_bits(spi, CANINTF, CANINTF_WAKIF, CANINTF_WAKIF);
+
+	/* Wait until the device is awake */
+	if (!wait_for_completion_timeout(&priv->awake, HZ))
+		dev_err(&spi->dev, "MCP251x didn't wake-up\n");
+}
+
+static netdev_tx_t mcp251x_hard_start_xmit(struct sk_buff *skb,
+					   struct net_device *net)
+{
+	struct mcp251x_priv *priv = netdev_priv(net);
+	struct spi_device *spi = priv->spi;
+
+	if (priv->tx_skb || priv->tx_len) {
+		dev_warn(&spi->dev, "hard_xmit called while tx busy\n");
+		netif_stop_queue(net);
+		return NETDEV_TX_BUSY;
+	}
+
+	if (skb->len != sizeof(struct can_frame)) {
+		dev_err(&spi->dev, "dropping packet - bad length\n");
+		dev_kfree_skb(skb);
+		net->stats.tx_dropped++;
+		return NETDEV_TX_OK;
+	}
+
+	netif_stop_queue(net);
+	priv->tx_skb = skb;
+	net->trans_start = jiffies;
+	queue_work(priv->wq, &priv->tx_work);
+
+	return NETDEV_TX_OK;
+}
+
+static int mcp251x_do_set_mode(struct net_device *net, enum can_mode mode)
+{
+	struct mcp251x_priv *priv = netdev_priv(net);
+
+	switch (mode) {
+	case CAN_MODE_START:
+		/* We have to delay work since SPI I/O may sleep */
+		priv->can.state = CAN_STATE_ERROR_ACTIVE;
+		priv->restart_tx = 1;
+		if (priv->can.restart_ms == 0)
+			priv->after_suspend = AFTER_SUSPEND_RESTART;
+		queue_work(priv->wq, &priv->irq_work);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static void mcp251x_set_normal_mode(struct spi_device *spi)
+{
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+	unsigned long timeout;
+
+	/* Enable interrupts */
+	mcp251x_write_reg(spi, CANINTE,
+			  CANINTE_ERRIE | CANINTE_TX2IE | CANINTE_TX1IE |
+			  CANINTE_TX0IE | CANINTE_RX1IE | CANINTE_RX0IE |
+			  CANINTF_MERRF);
+
+	if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) {
+		/* Put device into loopback mode */
+		mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_LOOPBACK);
+	} else {
+		/* Put device into normal mode */
+		mcp251x_write_reg(spi, CANCTRL, CANCTRL_REQOP_NORMAL);
+
+		/* Wait for the device to enter normal mode */
+		timeout = jiffies + HZ;
+		while (mcp251x_read_reg(spi, CANSTAT) & CANCTRL_REQOP_MASK) {
+			schedule();
+			if (time_after(jiffies, timeout)) {
+				dev_err(&spi->dev, "MCP251x didn't"
+					" enter in normal mode\n");
+				return;
+			}
+		}
+	}
+	priv->can.state = CAN_STATE_ERROR_ACTIVE;
+}
+
+static int mcp251x_do_set_bittiming(struct net_device *net)
+{
+	struct mcp251x_priv *priv = netdev_priv(net);
+	struct can_bittiming *bt = &priv->can.bittiming;
+	struct spi_device *spi = priv->spi;
+
+	mcp251x_write_reg(spi, CNF1, ((bt->sjw - 1) << CNF1_SJW_SHIFT) |
+			  (bt->brp - 1));
+	mcp251x_write_reg(spi, CNF2, CNF2_BTLMODE |
+			  (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES ?
+			   CNF2_SAM : 0) |
+			  ((bt->phase_seg1 - 1) << CNF2_PS1_SHIFT) |
+			  (bt->prop_seg - 1));
+	mcp251x_write_bits(spi, CNF3, CNF3_PHSEG2_MASK,
+			   (bt->phase_seg2 - 1));
+	dev_info(&spi->dev, "CNF: 0x%02x 0x%02x 0x%02x\n",
+		 mcp251x_read_reg(spi, CNF1),
+		 mcp251x_read_reg(spi, CNF2),
+		 mcp251x_read_reg(spi, CNF3));
+
+	return 0;
+}
+
+static int mcp251x_setup(struct net_device *net, struct mcp251x_priv *priv,
+			 struct spi_device *spi)
+{
+	int ret;
+
+	ret = open_candev(net);
+	if (ret) {
+		dev_err(&spi->dev, "unable to set initial baudrate!\n");
+		return ret;
+	}
+
+	/* Enable RX0->RX1 buffer roll over and disable filters */
+	mcp251x_write_bits(spi, RXBCTRL(0),
+			   RXBCTRL_BUKT | RXBCTRL_RXM0 | RXBCTRL_RXM1,
+			   RXBCTRL_BUKT | RXBCTRL_RXM0 | RXBCTRL_RXM1);
+	mcp251x_write_bits(spi, RXBCTRL(1),
+			   RXBCTRL_RXM0 | RXBCTRL_RXM1,
+			   RXBCTRL_RXM0 | RXBCTRL_RXM1);
+	return 0;
+}
+
+static void mcp251x_hw_reset(struct spi_device *spi)
+{
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+	int ret;
+
+	mutex_lock(&priv->spi_lock);
+
+	priv->spi_tx_buf[0] = INSTRUCTION_RESET;
+
+	ret = spi_write(spi, priv->spi_tx_buf, 1);
+
+	mutex_unlock(&priv->spi_lock);
+
+	if (ret)
+		dev_err(&spi->dev, "reset failed: ret = %d\n", ret);
+	/* Wait for reset to finish */
+	mdelay(10);
+}
+
+static int mcp251x_hw_probe(struct spi_device *spi)
+{
+	int st1, st2;
+
+	mcp251x_hw_reset(spi);
+
+	/*
+	 * Please note that these are "magic values" based on after
+	 * reset defaults taken from data sheet which allows us to see
+	 * if we really have a chip on the bus (we avoid common all
+	 * zeroes or all ones situations)
+	 */
+	st1 = mcp251x_read_reg(spi, CANSTAT) & 0xEE;
+	st2 = mcp251x_read_reg(spi, CANCTRL) & 0x17;
+
+	dev_dbg(&spi->dev, "CANSTAT 0x%02x CANCTRL 0x%02x\n", st1, st2);
+
+	/* Check for power up default values */
+	return (st1 == 0x80 && st2 == 0x07) ? 1 : 0;
+}
+
+static irqreturn_t mcp251x_can_isr(int irq, void *dev_id)
+{
+	struct net_device *net = (struct net_device *)dev_id;
+	struct mcp251x_priv *priv = netdev_priv(net);
+
+	/* Schedule bottom half */
+	if (!work_pending(&priv->irq_work))
+		queue_work(priv->wq, &priv->irq_work);
+
+	return IRQ_HANDLED;
+}
+
+static int mcp251x_open(struct net_device *net)
+{
+	struct mcp251x_priv *priv = netdev_priv(net);
+	struct spi_device *spi = priv->spi;
+	struct mcp251x_platform_data *pdata = spi->dev.platform_data;
+	int ret;
+
+	if (pdata->transceiver_enable)
+		pdata->transceiver_enable(1);
+
+	priv->force_quit = 0;
+	priv->tx_skb = NULL;
+	priv->tx_len = 0;
+
+	ret = request_irq(spi->irq, mcp251x_can_isr,
+			  IRQF_TRIGGER_FALLING, DEVICE_NAME, net);
+	if (ret) {
+		dev_err(&spi->dev, "failed to acquire irq %d\n", spi->irq);
+		if (pdata->transceiver_enable)
+			pdata->transceiver_enable(0);
+		return ret;
+	}
+
+	mcp251x_hw_wakeup(spi);
+	mcp251x_hw_reset(spi);
+	ret = mcp251x_setup(net, priv, spi);
+	if (ret) {
+		free_irq(spi->irq, net);
+		if (pdata->transceiver_enable)
+			pdata->transceiver_enable(0);
+		return ret;
+	}
+	mcp251x_set_normal_mode(spi);
+	netif_wake_queue(net);
+
+	return 0;
+}
+
+static int mcp251x_stop(struct net_device *net)
+{
+	struct mcp251x_priv *priv = netdev_priv(net);
+	struct spi_device *spi = priv->spi;
+	struct mcp251x_platform_data *pdata = spi->dev.platform_data;
+
+	close_candev(net);
+
+	/* Disable and clear pending interrupts */
+	mcp251x_write_reg(spi, CANINTE, 0x00);
+	mcp251x_write_reg(spi, CANINTF, 0x00);
+
+	priv->force_quit = 1;
+	free_irq(spi->irq, net);
+	flush_workqueue(priv->wq);
+
+	mcp251x_write_reg(spi, TXBCTRL(0), 0);
+	if (priv->tx_skb || priv->tx_len)
+		mcp251x_clean(net);
+
+	mcp251x_hw_sleep(spi);
+
+	if (pdata->transceiver_enable)
+		pdata->transceiver_enable(0);
+
+	priv->can.state = CAN_STATE_STOPPED;
+
+	return 0;
+}
+
+static void mcp251x_tx_work_handler(struct work_struct *ws)
+{
+	struct mcp251x_priv *priv = container_of(ws, struct mcp251x_priv,
+						 tx_work);
+	struct spi_device *spi = priv->spi;
+	struct net_device *net = priv->net;
+	struct can_frame *frame;
+
+	if (priv->tx_skb) {
+		frame = (struct can_frame *)priv->tx_skb->data;
+
+		if (priv->can.state == CAN_STATE_BUS_OFF) {
+			mcp251x_clean(net);
+			netif_wake_queue(net);
+			return;
+		}
+		if (frame->can_dlc > CAN_FRAME_MAX_DATA_LEN)
+			frame->can_dlc = CAN_FRAME_MAX_DATA_LEN;
+		mcp251x_hw_tx(spi, frame, 0);
+		priv->tx_len = 1 + frame->can_dlc;
+		can_put_echo_skb(priv->tx_skb, net, 0);
+		priv->tx_skb = NULL;
+	}
+}
+
+static void mcp251x_irq_work_handler(struct work_struct *ws)
+{
+	struct mcp251x_priv *priv = container_of(ws, struct mcp251x_priv,
+						 irq_work);
+	struct spi_device *spi = priv->spi;
+	struct net_device *net = priv->net;
+	u8 txbnctrl;
+	u8 intf;
+	enum can_state new_state;
+
+	if (priv->after_suspend) {
+		mdelay(10);
+		mcp251x_hw_reset(spi);
+		mcp251x_setup(net, priv, spi);
+		if (priv->after_suspend & AFTER_SUSPEND_RESTART) {
+			mcp251x_set_normal_mode(spi);
+		} else if (priv->after_suspend & AFTER_SUSPEND_UP) {
+			netif_device_attach(net);
+			/* Clean since we lost tx buffer */
+			if (priv->tx_skb || priv->tx_len) {
+				mcp251x_clean(net);
+				netif_wake_queue(net);
+			}
+			mcp251x_set_normal_mode(spi);
+		} else {
+			mcp251x_hw_sleep(spi);
+		}
+		priv->after_suspend = 0;
+	}
+
+	if (priv->can.restart_ms == 0 && priv->can.state == CAN_STATE_BUS_OFF)
+		return;
+
+	while (!priv->force_quit && !freezing(current)) {
+		u8 eflag = mcp251x_read_reg(spi, EFLG);
+		int can_id = 0, data1 = 0;
+
+		mcp251x_write_reg(spi, EFLG, 0x00);
+
+		if (priv->restart_tx) {
+			priv->restart_tx = 0;
+			mcp251x_write_reg(spi, TXBCTRL(0), 0);
+			if (priv->tx_skb || priv->tx_len)
+				mcp251x_clean(net);
+			netif_wake_queue(net);
+			can_id |= CAN_ERR_RESTARTED;
+		}
+
+		if (priv->wake) {
+			/* Wait whilst the device wakes up */
+			mdelay(10);
+			priv->wake = 0;
+		}
+
+		intf = mcp251x_read_reg(spi, CANINTF);
+		mcp251x_write_bits(spi, CANINTF, intf, 0x00);
+
+		/* Update can state */
+		if (eflag & EFLG_TXBO) {
+			new_state = CAN_STATE_BUS_OFF;
+			can_id |= CAN_ERR_BUSOFF;
+		} else if (eflag & EFLG_TXEP) {
+			new_state = CAN_STATE_ERROR_PASSIVE;
+			can_id |= CAN_ERR_CRTL;
+			data1 |= CAN_ERR_CRTL_TX_PASSIVE;
+		} else if (eflag & EFLG_RXEP) {
+			new_state = CAN_STATE_ERROR_PASSIVE;
+			can_id |= CAN_ERR_CRTL;
+			data1 |= CAN_ERR_CRTL_RX_PASSIVE;
+		} else if (eflag & EFLG_TXWAR) {
+			new_state = CAN_STATE_ERROR_WARNING;
+			can_id |= CAN_ERR_CRTL;
+			data1 |= CAN_ERR_CRTL_TX_WARNING;
+		} else if (eflag & EFLG_RXWAR) {
+			new_state = CAN_STATE_ERROR_WARNING;
+			can_id |= CAN_ERR_CRTL;
+			data1 |= CAN_ERR_CRTL_RX_WARNING;
+		} else {
+			new_state = CAN_STATE_ERROR_ACTIVE;
+		}
+
+		/* Update can state statistics */
+		switch (priv->can.state) {
+		case CAN_STATE_ERROR_ACTIVE:
+			if (new_state >= CAN_STATE_ERROR_WARNING &&
+			    new_state <= CAN_STATE_BUS_OFF)
+				priv->can.can_stats.error_warning++;
+		case CAN_STATE_ERROR_WARNING:	/* fallthrough */
+			if (new_state >= CAN_STATE_ERROR_PASSIVE &&
+			    new_state <= CAN_STATE_BUS_OFF)
+				priv->can.can_stats.error_passive++;
+			break;
+		default:
+			break;
+		}
+		priv->can.state = new_state;
+
+		if ((intf & CANINTF_ERRIF) || (can_id & CAN_ERR_RESTARTED)) {
+			struct sk_buff *skb;
+			struct can_frame *frame;
+
+			/* Create error frame */
+			skb = alloc_can_err_skb(net, &frame);
+			if (skb) {
+				/* Set error frame flags based on bus state */
+				frame->can_id = can_id;
+				frame->data[1] = data1;
+
+				/* Update net stats for overflows */
+				if (eflag & (EFLG_RX0OVR | EFLG_RX1OVR)) {
+					if (eflag & EFLG_RX0OVR)
+						net->stats.rx_over_errors++;
+					if (eflag & EFLG_RX1OVR)
+						net->stats.rx_over_errors++;
+					frame->can_id |= CAN_ERR_CRTL;
+					frame->data[1] |=
+						CAN_ERR_CRTL_RX_OVERFLOW;
+				}
+
+				netif_rx(skb);
+			} else {
+				dev_info(&spi->dev,
+					 "cannot allocate error skb\n");
+			}
+		}
+
+		if (priv->can.state == CAN_STATE_BUS_OFF) {
+			if (priv->can.restart_ms == 0) {
+				can_bus_off(net);
+				mcp251x_hw_sleep(spi);
+				return;
+			}
+		}
+
+		if (intf == 0)
+			break;
+
+		if (intf & CANINTF_WAKIF)
+			complete(&priv->awake);
+
+		if (intf & CANINTF_MERRF) {
+			/* If there are pending Tx buffers, restart queue */
+			txbnctrl = mcp251x_read_reg(spi, TXBCTRL(0));
+			if (!(txbnctrl & TXBCTRL_TXREQ)) {
+				if (priv->tx_skb || priv->tx_len)
+					mcp251x_clean(net);
+				netif_wake_queue(net);
+			}
+		}
+
+		if (intf & (CANINTF_TX2IF | CANINTF_TX1IF | CANINTF_TX0IF)) {
+			net->stats.tx_packets++;
+			net->stats.tx_bytes += priv->tx_len - 1;
+			if (priv->tx_len) {
+				can_get_echo_skb(net, 0);
+				priv->tx_len = 0;
+			}
+			netif_wake_queue(net);
+		}
+
+		if (intf & CANINTF_RX0IF)
+			mcp251x_hw_rx(spi, 0);
+
+		if (intf & CANINTF_RX1IF)
+			mcp251x_hw_rx(spi, 1);
+	}
+}
+
+static const struct net_device_ops mcp251x_netdev_ops = {
+	.ndo_open = mcp251x_open,
+	.ndo_stop = mcp251x_stop,
+	.ndo_start_xmit = mcp251x_hard_start_xmit,
+};
+
+static int __devinit mcp251x_can_probe(struct spi_device *spi)
+{
+	struct net_device *net;
+	struct mcp251x_priv *priv;
+	struct mcp251x_platform_data *pdata = spi->dev.platform_data;
+	int ret = -ENODEV;
+
+	if (!pdata)
+		/* Platform data is required for osc freq */
+		goto error_out;
+
+	/* Allocate can/net device */
+	net = alloc_candev(sizeof(struct mcp251x_priv), TX_ECHO_SKB_MAX);
+	if (!net) {
+		ret = -ENOMEM;
+		goto error_alloc;
+	}
+
+	net->netdev_ops = &mcp251x_netdev_ops;
+	net->flags |= IFF_ECHO;
+
+	priv = netdev_priv(net);
+	priv->can.bittiming_const = &mcp251x_bittiming_const;
+	priv->can.do_set_mode = mcp251x_do_set_mode;
+	priv->can.clock.freq = pdata->oscillator_frequency / 2;
+	priv->can.do_set_bittiming = mcp251x_do_set_bittiming;
+	priv->net = net;
+	dev_set_drvdata(&spi->dev, priv);
+
+	priv->spi = spi;
+	mutex_init(&priv->spi_lock);
+
+	/* If requested, allocate DMA buffers */
+	if (mcp251x_enable_dma) {
+		spi->dev.coherent_dma_mask = ~0;
+
+		/*
+		 * Minimum coherent DMA allocation is PAGE_SIZE, so allocate
+		 * that much and share it between Tx and Rx DMA buffers.
+		 */
+		priv->spi_tx_buf = dma_alloc_coherent(&spi->dev,
+						      PAGE_SIZE,
+						      &priv->spi_tx_dma,
+						      GFP_DMA);
+
+		if (priv->spi_tx_buf) {
+			priv->spi_rx_buf = (u8 *)(priv->spi_tx_buf +
+						  (PAGE_SIZE / 2));
+			priv->spi_rx_dma = (dma_addr_t)(priv->spi_tx_dma +
+							(PAGE_SIZE / 2));
+		} else {
+			/* Fall back to non-DMA */
+			mcp251x_enable_dma = 0;
+		}
+	}
+
+	/* Allocate non-DMA buffers */
+	if (!mcp251x_enable_dma) {
+		priv->spi_tx_buf = kmalloc(SPI_TRANSFER_BUF_LEN, GFP_KERNEL);
+		if (!priv->spi_tx_buf) {
+			ret = -ENOMEM;
+			goto error_tx_buf;
+		}
+		priv->spi_rx_buf = kmalloc(SPI_TRANSFER_BUF_LEN, GFP_KERNEL);
+		if (!priv->spi_tx_buf) {
+			ret = -ENOMEM;
+			goto error_rx_buf;
+		}
+	}
+
+	if (pdata->power_enable)
+		pdata->power_enable(1);
+
+	/* Call out to platform specific setup */
+	if (pdata->board_specific_setup)
+		pdata->board_specific_setup(spi);
+
+	SET_NETDEV_DEV(net, &spi->dev);
+
+	priv->wq = create_freezeable_workqueue("mcp251x_wq");
+
+	INIT_WORK(&priv->tx_work, mcp251x_tx_work_handler);
+	INIT_WORK(&priv->irq_work, mcp251x_irq_work_handler);
+
+	init_completion(&priv->awake);
+
+	/* Configure the SPI bus */
+	spi->mode = SPI_MODE_0;
+	spi->bits_per_word = 8;
+	spi_setup(spi);
+
+	if (!mcp251x_hw_probe(spi)) {
+		dev_info(&spi->dev, "Probe failed\n");
+		goto error_probe;
+	}
+	mcp251x_hw_sleep(spi);
+
+	if (pdata->transceiver_enable)
+		pdata->transceiver_enable(0);
+
+	ret = register_candev(net);
+	if (!ret) {
+		dev_info(&spi->dev, "probed\n");
+		return ret;
+	}
+error_probe:
+	if (!mcp251x_enable_dma)
+		kfree(priv->spi_rx_buf);
+error_rx_buf:
+	if (!mcp251x_enable_dma)
+		kfree(priv->spi_tx_buf);
+error_tx_buf:
+	free_candev(net);
+	if (mcp251x_enable_dma)
+		dma_free_coherent(&spi->dev, PAGE_SIZE,
+				  priv->spi_tx_buf, priv->spi_tx_dma);
+error_alloc:
+	if (pdata->power_enable)
+		pdata->power_enable(0);
+	dev_err(&spi->dev, "probe failed\n");
+error_out:
+	return ret;
+}
+
+static int __devexit mcp251x_can_remove(struct spi_device *spi)
+{
+	struct mcp251x_platform_data *pdata = spi->dev.platform_data;
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+	struct net_device *net = priv->net;
+
+	unregister_candev(net);
+	free_candev(net);
+
+	priv->force_quit = 1;
+	flush_workqueue(priv->wq);
+	destroy_workqueue(priv->wq);
+
+	if (mcp251x_enable_dma) {
+		dma_free_coherent(&spi->dev, PAGE_SIZE,
+				  priv->spi_tx_buf, priv->spi_tx_dma);
+	} else {
+		kfree(priv->spi_tx_buf);
+		kfree(priv->spi_rx_buf);
+	}
+
+	if (pdata->power_enable)
+		pdata->power_enable(0);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int mcp251x_can_suspend(struct spi_device *spi, pm_message_t state)
+{
+	struct mcp251x_platform_data *pdata = spi->dev.platform_data;
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+	struct net_device *net = priv->net;
+
+	if (netif_running(net)) {
+		netif_device_detach(net);
+
+		mcp251x_hw_sleep(spi);
+		if (pdata->transceiver_enable)
+			pdata->transceiver_enable(0);
+		priv->after_suspend = AFTER_SUSPEND_UP;
+	} else {
+		priv->after_suspend = AFTER_SUSPEND_DOWN;
+	}
+
+	if (pdata->power_enable) {
+		pdata->power_enable(0);
+		priv->after_suspend |= AFTER_SUSPEND_POWER;
+	}
+
+	return 0;
+}
+
+static int mcp251x_can_resume(struct spi_device *spi)
+{
+	struct mcp251x_platform_data *pdata = spi->dev.platform_data;
+	struct mcp251x_priv *priv = dev_get_drvdata(&spi->dev);
+
+	if (priv->after_suspend & AFTER_SUSPEND_POWER) {
+		pdata->power_enable(1);
+		queue_work(priv->wq, &priv->irq_work);
+	} else {
+		if (priv->after_suspend & AFTER_SUSPEND_UP) {
+			if (pdata->transceiver_enable)
+				pdata->transceiver_enable(1);
+			queue_work(priv->wq, &priv->irq_work);
+		} else {
+			priv->after_suspend = 0;
+		}
+	}
+	return 0;
+}
+#else
+#define mcp251x_can_suspend NULL
+#define mcp251x_can_resume NULL
+#endif
+
+static struct spi_driver mcp251x_can_driver = {
+	.driver = {
+		.name = DEVICE_NAME,
+		.bus = &spi_bus_type,
+		.owner = THIS_MODULE,
+	},
+
+	.probe = mcp251x_can_probe,
+	.remove = __devexit_p(mcp251x_can_remove),
+	.suspend = mcp251x_can_suspend,
+	.resume = mcp251x_can_resume,
+};
+
+static int __init mcp251x_can_init(void)
+{
+	return spi_register_driver(&mcp251x_can_driver);
+}
+
+static void __exit mcp251x_can_exit(void)
+{
+	spi_unregister_driver(&mcp251x_can_driver);
+}
+
+module_init(mcp251x_can_init);
+module_exit(mcp251x_can_exit);
+
+MODULE_AUTHOR("Chris Elston <celston@katalix.com>, "
+	      "Christian Pellegrin <chripell@evolware.org>");
+MODULE_DESCRIPTION("Microchip 251x CAN driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/can/platform/mcp251x.h b/include/linux/can/platform/mcp251x.h
new file mode 100644
index 000000000000..1448177d86d5
--- /dev/null
+++ b/include/linux/can/platform/mcp251x.h
@@ -0,0 +1,36 @@
+#ifndef __CAN_PLATFORM_MCP251X_H__
+#define __CAN_PLATFORM_MCP251X_H__
+
+/*
+ *
+ * CAN bus driver for Microchip 251x CAN Controller with SPI Interface
+ *
+ */
+
+#include <linux/spi/spi.h>
+
+/**
+ * struct mcp251x_platform_data - MCP251X SPI CAN controller platform data
+ * @oscillator_frequency:       - oscillator frequency in Hz
+ * @model:                      - actual type of chip
+ * @board_specific_setup:       - called before probing the chip (power,reset)
+ * @transceiver_enable:         - called to power on/off the transceiver
+ * @power_enable:               - called to power on/off the mcp *and* the
+ *                                transceiver
+ *
+ * Please note that you should define power_enable or transceiver_enable or
+ * none of them. Defining both of them is no use.
+ *
+ */
+
+struct mcp251x_platform_data {
+	unsigned long oscillator_frequency;
+	int model;
+#define CAN_MCP251X_MCP2510 0
+#define CAN_MCP251X_MCP2515 1
+	int (*board_specific_setup)(struct spi_device *spi);
+	int (*transceiver_enable)(int enable);
+	int (*power_enable) (int enable);
+};
+
+#endif /* __CAN_PLATFORM_MCP251X_H__ */
-- 
cgit v1.2.3


From 24f1e32c60c45c89a997c73395b69c8af6f0a84e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 9 Sep 2009 19:22:48 +0200
Subject: hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf
 events

This patch rebase the implementation of the breakpoints API on top of
perf events instances.

Each breakpoints are now perf events that handle the
register scheduling, thread/cpu attachment, etc..

The new layering is now made as follows:

       ptrace       kgdb      ftrace   perf syscall
          \          |          /         /
           \         |         /         /
                                        /
            Core breakpoint API        /
                                      /
                     |               /
                     |              /

              Breakpoints perf events

                     |
                     |

               Breakpoints PMU ---- Debug Register constraints handling
                                    (Part of core breakpoint API)
                     |
                     |

             Hardware debug registers

Reasons of this rewrite:

- Use the centralized/optimized pmu registers scheduling,
  implying an easier arch integration
- More powerful register handling: perf attributes (pinned/flexible
  events, exclusive/non-exclusive, tunable period, etc...)

Impact:

- New perf ABI: the hardware breakpoints counters
- Ptrace breakpoints setting remains tricky and still needs some per
  thread breakpoints references.

Todo (in the order):

- Support breakpoints perf counter events for perf tools (ie: implement
  perf_bpcounter_event())
- Support from perf tools

Changes in v2:

- Follow the perf "event " rename
- The ptrace regression have been fixed (ptrace breakpoint perf events
  weren't released when a task ended)
- Drop the struct hw_breakpoint and store generic fields in
  perf_event_attr.
- Separate core and arch specific headers, drop
  asm-generic/hw_breakpoint.h and create linux/hw_breakpoint.h
- Use new generic len/type for breakpoint
- Handle off case: when breakpoints api is not supported by an arch

Changes in v3:

- Fix broken CONFIG_KVM, we need to propagate the breakpoint api
  changes to kvm when we exit the guest and restore the bp registers
  to the host.

Changes in v4:

- Drop the hw_breakpoint_restore() stub as it is only used by KVM
- EXPORT_SYMBOL_GPL hw_breakpoint_restore() as KVM can be built as a
  module
- Restore the breakpoints unconditionally on kvm guest exit:
  TIF_DEBUG_THREAD doesn't anymore cover every cases of running
  breakpoints and vcpu->arch.switch_db_regs might not always be
  set when the guest used debug registers.
  (Waiting for a reliable optimization)

Changes in v5:

- Split-up the asm-generic/hw-breakpoint.h moving to
  linux/hw_breakpoint.h into a separate patch
- Optimize the breakpoints restoring while switching from kvm guest
  to host. We only want to restore the state if we have active
  breakpoints to the host, otherwise we don't care about messed-up
  address registers.
- Add asm/hw_breakpoint.h to Kbuild
- Fix bad breakpoint type in trace_selftest.c

Changes in v6:

- Fix wrong header inclusion in trace.h (triggered a build
  error with CONFIG_FTRACE_SELFTEST

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jan Kiszka <jan.kiszka@web.de>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mundt <lethal@linux-sh.org>
---
 arch/Kconfig                         |   3 +
 arch/x86/include/asm/Kbuild          |   1 +
 arch/x86/include/asm/debugreg.h      |  11 +-
 arch/x86/include/asm/hw_breakpoint.h |  58 +++--
 arch/x86/include/asm/processor.h     |  12 +-
 arch/x86/kernel/hw_breakpoint.c      | 391 +++++++++++++++++++++-----------
 arch/x86/kernel/process.c            |   7 +-
 arch/x86/kernel/process_32.c         |  26 +--
 arch/x86/kernel/process_64.c         |  26 +--
 arch/x86/kernel/ptrace.c             | 182 ++++++++++-----
 arch/x86/kernel/smpboot.c            |   3 -
 arch/x86/kvm/x86.c                   |  18 +-
 arch/x86/power/cpu.c                 |   6 -
 include/linux/hw_breakpoint.h        | 243 ++++++++++----------
 include/linux/perf_event.h           |  26 ++-
 kernel/exit.c                        |   5 +
 kernel/hw_breakpoint.c               | 424 ++++++++++++++---------------------
 kernel/perf_event.c                  |  53 ++++-
 kernel/trace/trace.h                 |   5 +-
 kernel/trace/trace_entries.h         |   6 +-
 kernel/trace/trace_ksym.c            | 126 +++++------
 kernel/trace/trace_selftest.c        |   3 +-
 22 files changed, 885 insertions(+), 750 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index acb664397945..eef3bbb97075 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -128,6 +128,9 @@ config HAVE_DEFAULT_NO_SPIN_MUTEXES
 
 config HAVE_HW_BREAKPOINT
 	bool
+	depends on HAVE_PERF_EVENTS
+	select ANON_INODES
+	select PERF_EVENTS
 
 
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..9f828f87ca35 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
 header-y += processor-flags.h
+header-y += hw_breakpoint.h
 
 unifdef-y += e820.h
 unifdef-y += ist.h
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 23439fbb1d0e..9a3333c91f9a 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -75,13 +75,8 @@
  */
 #ifdef __KERNEL__
 
-/* For process management */
-extern void flush_thread_hw_breakpoint(struct task_struct *tsk);
-extern int copy_thread_hw_breakpoint(struct task_struct *tsk,
-		struct task_struct *child, unsigned long clone_flags);
+DECLARE_PER_CPU(unsigned long, dr7);
 
-/* For CPU management */
-extern void load_debug_registers(void);
 static inline void hw_breakpoint_disable(void)
 {
 	/* Zero the control register for HW Breakpoint */
@@ -94,6 +89,10 @@ static inline void hw_breakpoint_disable(void)
 	set_debugreg(0UL, 3);
 }
 
+#ifdef CONFIG_KVM
+extern void hw_breakpoint_restore(void);
+#endif
+
 #endif	/* __KERNEL__ */
 
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 3cfca8e2b5f6..0675a7c4c20e 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -4,6 +4,11 @@
 #ifdef	__KERNEL__
 #define	__ARCH_HW_BREAKPOINT_H
 
+/*
+ * The name should probably be something dealt in
+ * a higher level. While dealing with the user
+ * (display/resolving)
+ */
 struct arch_hw_breakpoint {
 	char		*name; /* Contains name of the symbol to set bkpt */
 	unsigned long	address;
@@ -12,44 +17,57 @@ struct arch_hw_breakpoint {
 };
 
 #include <linux/kdebug.h>
-#include <linux/hw_breakpoint.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
 
 /* Available HW breakpoint length encodings */
-#define HW_BREAKPOINT_LEN_1		0x40
-#define HW_BREAKPOINT_LEN_2		0x44
-#define HW_BREAKPOINT_LEN_4		0x4c
-#define HW_BREAKPOINT_LEN_EXECUTE	0x40
+#define X86_BREAKPOINT_LEN_1		0x40
+#define X86_BREAKPOINT_LEN_2		0x44
+#define X86_BREAKPOINT_LEN_4		0x4c
+#define X86_BREAKPOINT_LEN_EXECUTE	0x40
 
 #ifdef CONFIG_X86_64
-#define HW_BREAKPOINT_LEN_8		0x48
+#define X86_BREAKPOINT_LEN_8		0x48
 #endif
 
 /* Available HW breakpoint type encodings */
 
 /* trigger on instruction execute */
-#define HW_BREAKPOINT_EXECUTE	0x80
+#define X86_BREAKPOINT_EXECUTE	0x80
 /* trigger on memory write */
-#define HW_BREAKPOINT_WRITE	0x81
+#define X86_BREAKPOINT_WRITE	0x81
 /* trigger on memory read or write */
-#define HW_BREAKPOINT_RW	0x83
+#define X86_BREAKPOINT_RW	0x83
 
 /* Total number of available HW breakpoint registers */
 #define HBP_NUM 4
 
-extern struct hw_breakpoint *hbp_kernel[HBP_NUM];
-DECLARE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]);
-extern unsigned int hbp_user_refcount[HBP_NUM];
+struct perf_event;
+struct pmu;
 
-extern void arch_install_thread_hw_breakpoint(struct task_struct *tsk);
-extern void arch_uninstall_thread_hw_breakpoint(void);
 extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
-extern int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
-						struct task_struct *tsk);
-extern void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk);
-extern void arch_flush_thread_hw_breakpoint(struct task_struct *tsk);
-extern void arch_update_kernel_hw_breakpoint(void *);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
+					 struct task_struct *tsk);
 extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
-				     unsigned long val, void *data);
+					   unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+				  int *gen_len, int *gen_type);
+
+extern struct pmu perf_ops_bp;
+
 #endif	/* __KERNEL__ */
 #endif	/* _I386_HW_BREAKPOINT_H */
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 61aafb71c7ef..820f3000f736 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -423,6 +423,8 @@ extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
 
+struct perf_event;
+
 struct thread_struct {
 	/* Cached TLS descriptors: */
 	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -444,12 +446,10 @@ struct thread_struct {
 	unsigned long		fs;
 #endif
 	unsigned long		gs;
-	/* Hardware debugging registers: */
-	unsigned long		debugreg[HBP_NUM];
-	unsigned long		debugreg6;
-	unsigned long		debugreg7;
-	/* Hardware breakpoint info */
-	struct hw_breakpoint	*hbp[HBP_NUM];
+	/* Save middle states of ptrace breakpoints */
+	struct perf_event	*ptrace_bps[HBP_NUM];
+	/* Debug status used for traps, single steps, etc... */
+	unsigned long           debugreg6;
 	/* Fault info: */
 	unsigned long		cr2;
 	unsigned long		trap_no;
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 9316a9de4de3..e622620790bd 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -15,6 +15,7 @@
  *
  * Copyright (C) 2007 Alan Stern
  * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
  */
 
 /*
@@ -22,6 +23,8 @@
  * using the CPU's debug registers.
  */
 
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 #include <linux/irqflags.h>
 #include <linux/notifier.h>
 #include <linux/kallsyms.h>
@@ -38,26 +41,24 @@
 #include <asm/processor.h>
 #include <asm/debugreg.h>
 
-/* Unmasked kernel DR7 value */
-static unsigned long kdr7;
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
 
 /*
- * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register.
- * Used to clear and verify the status of bits corresponding to DR0 - DR3
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each cpus
  */
-static const unsigned long	dr7_masks[HBP_NUM] = {
-	0x000f0003,	/* LEN0, R/W0, G0, L0 */
-	0x00f0000c,	/* LEN1, R/W1, G1, L1 */
-	0x0f000030,	/* LEN2, R/W2, G2, L2 */
-	0xf00000c0	/* LEN3, R/W3, G3, L3 */
-};
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
 
 
 /*
  * Encode the length, type, Exact, and Enable bits for a particular breakpoint
  * as stored in debug register 7.
  */
-static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
 {
 	unsigned long bp_info;
 
@@ -68,64 +69,89 @@ static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
 	return bp_info;
 }
 
-void arch_update_kernel_hw_breakpoint(void *unused)
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
 {
-	struct hw_breakpoint *bp;
-	int i, cpu = get_cpu();
-	unsigned long temp_kdr7 = 0;
-
-	/* Don't allow debug exceptions while we update the registers */
-	set_debugreg(0UL, 7);
+	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
 
-	for (i = hbp_kernel_pos; i < HBP_NUM; i++) {
-		per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i];
-		if (bp) {
-			temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type);
-			set_debugreg(bp->info.address, i);
-		}
-	}
+	*len = (bp_info & 0xc) | 0x40;
+	*type = (bp_info & 0x3) | 0x80;
 
-	/* No need to set DR6. Update the debug registers with kernel-space
-	 * breakpoint values from kdr7 and user-space requests from the
-	 * current process
-	 */
-	kdr7 = temp_kdr7;
-	set_debugreg(kdr7 | current->thread.debugreg7, 7);
-	put_cpu();
+	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
 }
 
 /*
- * Install the thread breakpoints in their debug registers.
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint. Eventually we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
  */
-void arch_install_thread_hw_breakpoint(struct task_struct *tsk)
+int arch_install_hw_breakpoint(struct perf_event *bp)
 {
-	struct thread_struct *thread = &(tsk->thread);
-
-	switch (hbp_kernel_pos) {
-	case 4:
-		set_debugreg(thread->debugreg[3], 3);
-	case 3:
-		set_debugreg(thread->debugreg[2], 2);
-	case 2:
-		set_debugreg(thread->debugreg[1], 1);
-	case 1:
-		set_debugreg(thread->debugreg[0], 0);
-	default:
-		break;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (!*slot) {
+			*slot = bp;
+			break;
+		}
 	}
 
-	/* No need to set DR6 */
-	set_debugreg((kdr7 | thread->debugreg7), 7);
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return -EBUSY;
+
+	set_debugreg(info->address, i);
+	__get_cpu_var(cpu_debugreg[i]) = info->address;
+
+	dr7 = &__get_cpu_var(dr7);
+	*dr7 |= encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
+
+	return 0;
 }
 
 /*
- * Install the debug register values for just the kernel, no thread.
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
  */
-void arch_uninstall_thread_hw_breakpoint(void)
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
 {
-	/* Clear the user-space portion of debugreg7 by setting only kdr7 */
-	set_debugreg(kdr7, 7);
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+		if (*slot == bp) {
+			*slot = NULL;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return;
 
+	dr7 = &__get_cpu_var(dr7);
+	*dr7 &= ~encode_dr7(i, info->len, info->type);
+
+	set_debugreg(*dr7, 7);
 }
 
 static int get_hbp_len(u8 hbp_len)
@@ -133,17 +159,17 @@ static int get_hbp_len(u8 hbp_len)
 	unsigned int len_in_bytes = 0;
 
 	switch (hbp_len) {
-	case HW_BREAKPOINT_LEN_1:
+	case X86_BREAKPOINT_LEN_1:
 		len_in_bytes = 1;
 		break;
-	case HW_BREAKPOINT_LEN_2:
+	case X86_BREAKPOINT_LEN_2:
 		len_in_bytes = 2;
 		break;
-	case HW_BREAKPOINT_LEN_4:
+	case X86_BREAKPOINT_LEN_4:
 		len_in_bytes = 4;
 		break;
 #ifdef CONFIG_X86_64
-	case HW_BREAKPOINT_LEN_8:
+	case X86_BREAKPOINT_LEN_8:
 		len_in_bytes = 8;
 		break;
 #endif
@@ -178,67 +204,146 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
 /*
  * Store a breakpoint's encoded address, length, and type.
  */
-static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk)
+static int arch_store_info(struct perf_event *bp)
 {
-	/*
-	 * User-space requests will always have the address field populated
-	 * Symbol names from user-space are rejected
-	 */
-	if (tsk && bp->info.name)
-		return -EINVAL;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 	/*
 	 * For kernel-addresses, either the address or symbol name can be
 	 * specified.
 	 */
-	if (bp->info.name)
-		bp->info.address = (unsigned long)
-					kallsyms_lookup_name(bp->info.name);
-	if (bp->info.address)
+	if (info->name)
+		info->address = (unsigned long)
+				kallsyms_lookup_name(info->name);
+	if (info->address)
 		return 0;
+
 	return -EINVAL;
 }
 
-/*
- * Validate the arch-specific HW Breakpoint register settings
- */
-int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
-						struct task_struct *tsk)
+int arch_bp_generic_fields(int x86_len, int x86_type,
+			   int *gen_len, int *gen_type)
 {
-	unsigned int align;
-	int ret = -EINVAL;
+	/* Len */
+	switch (x86_len) {
+	case X86_BREAKPOINT_LEN_1:
+		*gen_len = HW_BREAKPOINT_LEN_1;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		*gen_len = HW_BREAKPOINT_LEN_2;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		*gen_len = HW_BREAKPOINT_LEN_4;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		*gen_len = HW_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
 
-	switch (bp->info.type) {
-	/*
-	 * Ptrace-refactoring code
-	 * For now, we'll allow instruction breakpoint only for user-space
-	 * addresses
-	 */
-	case HW_BREAKPOINT_EXECUTE:
-		if ((!arch_check_va_in_userspace(bp->info.address,
-							bp->info.len)) &&
-			bp->info.len != HW_BREAKPOINT_LEN_EXECUTE)
-			return ret;
+	/* Type */
+	switch (x86_type) {
+	case X86_BREAKPOINT_EXECUTE:
+		*gen_type = HW_BREAKPOINT_X;
 		break;
-	case HW_BREAKPOINT_WRITE:
+	case X86_BREAKPOINT_WRITE:
+		*gen_type = HW_BREAKPOINT_W;
 		break;
-	case HW_BREAKPOINT_RW:
+	case X86_BREAKPOINT_RW:
+		*gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
 		break;
 	default:
-		return ret;
+		return -EINVAL;
 	}
 
-	switch (bp->info.len) {
+	return 0;
+}
+
+
+static int arch_build_bp_info(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+	info->address = bp->attr.bp_addr;
+
+	/* Len */
+	switch (bp->attr.bp_len) {
 	case HW_BREAKPOINT_LEN_1:
-		align = 0;
+		info->len = X86_BREAKPOINT_LEN_1;
 		break;
 	case HW_BREAKPOINT_LEN_2:
-		align = 1;
+		info->len = X86_BREAKPOINT_LEN_2;
 		break;
 	case HW_BREAKPOINT_LEN_4:
-		align = 3;
+		info->len = X86_BREAKPOINT_LEN_4;
 		break;
 #ifdef CONFIG_X86_64
 	case HW_BREAKPOINT_LEN_8:
+		info->len = X86_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	/* Type */
+	switch (bp->attr.bp_type) {
+	case HW_BREAKPOINT_W:
+		info->type = X86_BREAKPOINT_WRITE;
+		break;
+	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+		info->type = X86_BREAKPOINT_RW;
+		break;
+	case HW_BREAKPOINT_X:
+		info->type = X86_BREAKPOINT_EXECUTE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp,
+				  struct task_struct *tsk)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned int align;
+	int ret;
+
+
+	ret = arch_build_bp_info(bp);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+
+	if (info->type == X86_BREAKPOINT_EXECUTE)
+		/*
+		 * Ptrace-refactoring code
+		 * For now, we'll allow instruction breakpoint only for user-space
+		 * addresses
+		 */
+		if ((!arch_check_va_in_userspace(info->address, info->len)) &&
+			info->len != X86_BREAKPOINT_EXECUTE)
+			return ret;
+
+	switch (info->len) {
+	case X86_BREAKPOINT_LEN_1:
+		align = 0;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		align = 1;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		align = 3;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
 		align = 7;
 		break;
 #endif
@@ -246,8 +351,8 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
 		return ret;
 	}
 
-	if (bp->triggered)
-		ret = arch_store_info(bp, tsk);
+	if (bp->callback)
+		ret = arch_store_info(bp);
 
 	if (ret < 0)
 		return ret;
@@ -255,44 +360,47 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp,
 	 * Check that the low-order bits of the address are appropriate
 	 * for the alignment implied by len.
 	 */
-	if (bp->info.address & align)
+	if (info->address & align)
 		return -EINVAL;
 
 	/* Check that the virtual address is in the proper range */
 	if (tsk) {
-		if (!arch_check_va_in_userspace(bp->info.address, bp->info.len))
+		if (!arch_check_va_in_userspace(info->address, info->len))
 			return -EFAULT;
 	} else {
-		if (!arch_check_va_in_kernelspace(bp->info.address,
-								bp->info.len))
+		if (!arch_check_va_in_kernelspace(info->address, info->len))
 			return -EFAULT;
 	}
+
 	return 0;
 }
 
-void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk)
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	struct hw_breakpoint *bp = thread->hbp[pos];
-
-	thread->debugreg7 &= ~dr7_masks[pos];
-	if (bp) {
-		thread->debugreg[pos] = bp->info.address;
-		thread->debugreg7 |= encode_dr7(pos, bp->info.len,
-							bp->info.type);
-	} else
-		thread->debugreg[pos] = 0;
+	int i;
+	struct thread_struct *t = &tsk->thread;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		unregister_hw_breakpoint(t->ptrace_bps[i]);
+		t->ptrace_bps[i] = NULL;
+	}
 }
 
-void arch_flush_thread_hw_breakpoint(struct task_struct *tsk)
+#ifdef CONFIG_KVM
+void hw_breakpoint_restore(void)
 {
-	int i;
-	struct thread_struct *thread = &(tsk->thread);
-
-	thread->debugreg7 = 0;
-	for (i = 0; i < HBP_NUM; i++)
-		thread->debugreg[i] = 0;
+	set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+	set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+	set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+	set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+	set_debugreg(current->thread.debugreg6, 6);
+	set_debugreg(__get_cpu_var(dr7), 7);
 }
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+#endif
 
 /*
  * Handle debug exception notifications.
@@ -313,7 +421,7 @@ void arch_flush_thread_hw_breakpoint(struct task_struct *tsk)
 static int __kprobes hw_breakpoint_handler(struct die_args *args)
 {
 	int i, cpu, rc = NOTIFY_STOP;
-	struct hw_breakpoint *bp;
+	struct perf_event *bp;
 	unsigned long dr7, dr6;
 	unsigned long *dr6_p;
 
@@ -325,10 +433,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 	if ((dr6 & DR_TRAP_BITS) == 0)
 		return NOTIFY_DONE;
 
-	/* Lazy debug register switching */
-	if (!test_tsk_thread_flag(current, TIF_DEBUG))
-		arch_uninstall_thread_hw_breakpoint();
-
 	get_debugreg(dr7, 7);
 	/* Disable breakpoints during exception handling */
 	set_debugreg(0UL, 7);
@@ -344,17 +448,18 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 	for (i = 0; i < HBP_NUM; ++i) {
 		if (likely(!(dr6 & (DR_TRAP0 << i))))
 			continue;
+
 		/*
-		 * Find the corresponding hw_breakpoint structure and
-		 * invoke its triggered callback.
+		 * The counter may be concurrently released but that can only
+		 * occur from a call_rcu() path. We can then safely fetch
+		 * the breakpoint, use its callback, touch its counter
+		 * while we are in an rcu_read_lock() path.
 		 */
-		if (i >= hbp_kernel_pos)
-			bp = per_cpu(this_hbp_kernel[i], cpu);
-		else {
-			bp = current->thread.hbp[i];
-			if (bp)
-				rc = NOTIFY_DONE;
-		}
+		rcu_read_lock();
+
+		bp = per_cpu(bp_per_reg[i], cpu);
+		if (bp)
+			rc = NOTIFY_DONE;
 		/*
 		 * Reset the 'i'th TRAP bit in dr6 to denote completion of
 		 * exception handling
@@ -362,19 +467,23 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 		(*dr6_p) &= ~(DR_TRAP0 << i);
 		/*
 		 * bp can be NULL due to lazy debug register switching
-		 * or due to the delay between updates of hbp_kernel_pos
-		 * and this_hbp_kernel.
+		 * or due to concurrent perf counter removing.
 		 */
-		if (!bp)
-			continue;
+		if (!bp) {
+			rcu_read_unlock();
+			break;
+		}
+
+		(bp->callback)(bp, args->regs);
 
-		(bp->triggered)(bp, args->regs);
+		rcu_read_unlock();
 	}
 	if (dr6 & (~DR_TRAP_BITS))
 		rc = NOTIFY_DONE;
 
 	set_debugreg(dr7, 7);
 	put_cpu();
+
 	return rc;
 }
 
@@ -389,3 +498,13 @@ int __kprobes hw_breakpoint_exceptions_notify(
 
 	return hw_breakpoint_handler(data);
 }
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+	/* TODO */
+}
+
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
+{
+	/* TODO */
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cf8ee0016307..744508e7cfdd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -10,6 +10,7 @@
 #include <linux/clockchips.h>
 #include <linux/random.h>
 #include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
@@ -18,7 +19,6 @@
 #include <asm/i387.h>
 #include <asm/ds.h>
 #include <asm/debugreg.h>
-#include <asm/hw_breakpoint.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -47,8 +47,6 @@ void free_thread_xstate(struct task_struct *tsk)
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		tsk->thread.xstate = NULL;
 	}
-	if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG)))
-		flush_thread_hw_breakpoint(tsk);
 
 	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
@@ -107,8 +105,7 @@ void flush_thread(void)
 	}
 #endif
 
-	if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG)))
-		flush_thread_hw_breakpoint(tsk);
+	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 	/*
 	 * Forget coprocessor state..
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 209e74801763..d5bd3132ee70 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -59,7 +59,6 @@
 #include <asm/syscalls.h>
 #include <asm/ds.h>
 #include <asm/debugreg.h>
-#include <asm/hw_breakpoint.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -264,9 +263,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	p->thread.io_bitmap_ptr = NULL;
 	tsk = current;
 	err = -ENOMEM;
-	if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG)))
-		if (copy_thread_hw_breakpoint(tsk, p, clone_flags))
-			goto out;
+
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
@@ -287,13 +285,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		err = do_set_thread_area(p, -1,
 			(struct user_desc __user *)childregs->si, 0);
 
-out:
 	if (err && p->thread.io_bitmap_ptr) {
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
-	if (err)
-		flush_thread_hw_breakpoint(p);
 
 	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
 	p->thread.ds_ctx = NULL;
@@ -437,23 +432,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		lazy_load_gs(next->gs);
 
 	percpu_write(current_task, next_p);
-	/*
-	 * There's a problem with moving the arch_install_thread_hw_breakpoint()
-	 * call before current is updated.  Suppose a kernel breakpoint is
-	 * triggered in between the two, the hw-breakpoint handler will see that
-	 * the 'current' task does not have TIF_DEBUG flag set and will think it
-	 * is leftover from an old task (lazy switching) and will erase it. Then
-	 * until the next context switch, no user-breakpoints will be installed.
-	 *
-	 * The real problem is that it's impossible to update both current and
-	 * physical debug registers at the same instant, so there will always be
-	 * a window in which they disagree and a breakpoint might get triggered.
-	 * Since we use lazy switching, we are forced to assume that a
-	 * disagreement means that current is correct and the exception is due
-	 * to lazy debug register switching.
-	 */
-	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
-		arch_install_thread_hw_breakpoint(next_p);
 
 	return prev_p;
 }
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 72edac026a78..5bafdec34441 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -53,7 +53,6 @@
 #include <asm/syscalls.h>
 #include <asm/ds.h>
 #include <asm/debugreg.h>
-#include <asm/hw_breakpoint.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -244,8 +243,6 @@ void release_thread(struct task_struct *dead_task)
 			BUG();
 		}
 	}
-	if (unlikely(dead_task->thread.debugreg7))
-		flush_thread_hw_breakpoint(dead_task);
 }
 
 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
@@ -309,9 +306,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	savesegment(ds, p->thread.ds);
 
 	err = -ENOMEM;
-	if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG)))
-		if (copy_thread_hw_breakpoint(me, p, clone_flags))
-			goto out;
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
@@ -351,8 +346,6 @@ out:
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
-	if (err)
-		flush_thread_hw_breakpoint(p);
 
 	return err;
 }
@@ -508,23 +501,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	if (preload_fpu)
 		__math_state_restore();
-	/*
-	 * There's a problem with moving the arch_install_thread_hw_breakpoint()
-	 * call before current is updated.  Suppose a kernel breakpoint is
-	 * triggered in between the two, the hw-breakpoint handler will see that
-	 * the 'current' task does not have TIF_DEBUG flag set and will think it
-	 * is leftover from an old task (lazy switching) and will erase it. Then
-	 * until the next context switch, no user-breakpoints will be installed.
-	 *
-	 * The real problem is that it's impossible to update both current and
-	 * physical debug registers at the same instant, so there will always be
-	 * a window in which they disagree and a breakpoint might get triggered.
-	 * Since we use lazy switching, we are forced to assume that a
-	 * disagreement means that current is correct and the exception is due
-	 * to lazy debug register switching.
-	 */
-	if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
-		arch_install_thread_hw_breakpoint(next_p);
 
 	return prev_p;
 }
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 267cb85b479c..e79610d95971 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
 #include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/workqueue.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -441,54 +443,59 @@ static int genregs_set(struct task_struct *target,
 	return ret;
 }
 
-/*
- * Decode the length and type bits for a particular breakpoint as
- * stored in debug register 7.  Return the "enabled" status.
- */
-static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len,
-		unsigned *type)
-{
-	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
-
-	*len = (bp_info & 0xc) | 0x40;
-	*type = (bp_info & 0x3) | 0x80;
-	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
-}
-
-static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
+static void ptrace_triggered(struct perf_event *bp, void *data)
 {
-	struct thread_struct *thread = &(current->thread);
 	int i;
+	struct thread_struct *thread = &(current->thread);
 
 	/*
 	 * Store in the virtual DR6 register the fact that the breakpoint
 	 * was hit so the thread's debugger will see it.
 	 */
-	for (i = 0; i < hbp_kernel_pos; i++)
-		/*
-		 * We will check bp->info.address against the address stored in
-		 * thread's hbp structure and not debugreg[i]. This is to ensure
-		 * that the corresponding bit for 'i' in DR7 register is enabled
-		 */
-		if (bp->info.address == thread->hbp[i]->info.address)
+	for (i = 0; i < HBP_NUM; i++) {
+		if (thread->ptrace_bps[i] == bp)
 			break;
+	}
 
 	thread->debugreg6 |= (DR_TRAP0 << i);
 }
 
+/*
+ * Walk through every ptrace breakpoints for this thread and
+ * build the dr7 value on top of their attributes.
+ *
+ */
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
+{
+	int i;
+	int dr7 = 0;
+	struct arch_hw_breakpoint *info;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		if (bp[i] && !bp[i]->attr.disabled) {
+			info = counter_arch_bp(bp[i]);
+			dr7 |= encode_dr7(i, info->len, info->type);
+		}
+	}
+
+	return dr7;
+}
+
 /*
  * Handle ptrace writes to debug register 7.
  */
 static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 {
 	struct thread_struct *thread = &(tsk->thread);
-	unsigned long old_dr7 = thread->debugreg7;
+	unsigned long old_dr7;
 	int i, orig_ret = 0, rc = 0;
 	int enabled, second_pass = 0;
 	unsigned len, type;
-	struct hw_breakpoint *bp;
+	int gen_len, gen_type;
+	struct perf_event *bp;
 
 	data &= ~DR_CONTROL_RESERVED;
+	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
 restore:
 	/*
 	 * Loop through all the hardware breakpoints, making the
@@ -496,11 +503,12 @@ restore:
 	 */
 	for (i = 0; i < HBP_NUM; i++) {
 		enabled = decode_dr7(data, i, &len, &type);
-		bp = thread->hbp[i];
+		bp = thread->ptrace_bps[i];
 
 		if (!enabled) {
 			if (bp) {
-				/* Don't unregister the breakpoints right-away,
+				/*
+				 * Don't unregister the breakpoints right-away,
 				 * unless all register_user_hw_breakpoint()
 				 * requests have succeeded. This prevents
 				 * any window of opportunity for debug
@@ -508,27 +516,45 @@ restore:
 				 */
 				if (!second_pass)
 					continue;
-				unregister_user_hw_breakpoint(tsk, bp);
-				kfree(bp);
+				thread->ptrace_bps[i] = NULL;
+				unregister_hw_breakpoint(bp);
 			}
 			continue;
 		}
+
+		/*
+		 * We shoud have at least an inactive breakpoint at this
+		 * slot. It means the user is writing dr7 without having
+		 * written the address register first
+		 */
 		if (!bp) {
-			rc = -ENOMEM;
-			bp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
-			if (bp) {
-				bp->info.address = thread->debugreg[i];
-				bp->triggered = ptrace_triggered;
-				bp->info.len = len;
-				bp->info.type = type;
-				rc = register_user_hw_breakpoint(tsk, bp);
-				if (rc)
-					kfree(bp);
-			}
-		} else
-			rc = modify_user_hw_breakpoint(tsk, bp);
+			rc = -EINVAL;
+			break;
+		}
+
+		rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
 		if (rc)
 			break;
+
+		/*
+		 * This is a temporary thing as bp is unregistered/registered
+		 * to simulate modification
+		 */
+		bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len,
+					       gen_type, bp->callback,
+					       tsk, true);
+		thread->ptrace_bps[i] = NULL;
+
+		if (!bp) { /* incorrect bp, or we have a bug in bp API */
+			rc = -EINVAL;
+			break;
+		}
+		if (IS_ERR(bp)) {
+			rc = PTR_ERR(bp);
+			bp = NULL;
+			break;
+		}
+		thread->ptrace_bps[i] = bp;
 	}
 	/*
 	 * Make a second pass to free the remaining unused breakpoints
@@ -553,15 +579,63 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 	struct thread_struct *thread = &(tsk->thread);
 	unsigned long val = 0;
 
-	if (n < HBP_NUM)
-		val = thread->debugreg[n];
-	else if (n == 6)
+	if (n < HBP_NUM) {
+		struct perf_event *bp;
+		bp = thread->ptrace_bps[n];
+		if (!bp)
+			return 0;
+		val = bp->hw.info.address;
+	} else if (n == 6) {
 		val = thread->debugreg6;
-	else if (n == 7)
-		val = thread->debugreg7;
+	 } else if (n == 7) {
+		val = ptrace_get_dr7(thread->ptrace_bps);
+	}
 	return val;
 }
 
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+				      unsigned long addr)
+{
+	struct perf_event *bp;
+	struct thread_struct *t = &tsk->thread;
+
+	if (!t->ptrace_bps[nr]) {
+		/*
+		 * Put stub len and type to register (reserve) an inactive but
+		 * correct bp
+		 */
+		bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1,
+						 HW_BREAKPOINT_W,
+						 ptrace_triggered, tsk,
+						 false);
+	} else {
+		bp = t->ptrace_bps[nr];
+		t->ptrace_bps[nr] = NULL;
+		bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len,
+					       bp->attr.bp_type,
+					       bp->callback,
+					       tsk,
+					       bp->attr.disabled);
+	}
+
+	if (!bp)
+		return -EIO;
+	/*
+	 * CHECKME: the previous code returned -EIO if the addr wasn't a
+	 * valid task virtual addr. The new one will return -EINVAL in this
+	 * case.
+	 * -EINVAL may be what we want for in-kernel breakpoints users, but
+	 * -EIO looks better for ptrace, since we refuse a register writing
+	 * for the user. And anyway this is the previous behaviour.
+	 */
+	if (IS_ERR(bp))
+		return PTR_ERR(bp);
+
+	t->ptrace_bps[nr] = bp;
+
+	return 0;
+}
+
 /*
  * Handle PTRACE_POKEUSR calls for the debug register area.
  */
@@ -575,19 +649,13 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
 		return -EIO;
 
 	if (n == 6) {
-		tsk->thread.debugreg6 = val;
+		thread->debugreg6 = val;
 		goto ret_path;
 	}
 	if (n < HBP_NUM) {
-		if (thread->hbp[n]) {
-			if (arch_check_va_in_userspace(val,
-					thread->hbp[n]->info.len) == 0) {
-				rc = -EIO;
-				goto ret_path;
-			}
-			thread->hbp[n]->info.address = val;
-		}
-		thread->debugreg[n] = val;
+		rc = ptrace_set_breakpoint_addr(tsk, n, val);
+		if (rc)
+			return rc;
 	}
 	/* All that's left is DR7 */
 	if (n == 7)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 213a7a3e4562..565ebc65920e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -64,7 +64,6 @@
 #include <asm/apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
-#include <asm/debugreg.h>
 #include <linux/mc146818rtc.h>
 
 #include <asm/smpboot_hooks.h>
@@ -328,7 +327,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	x86_cpuinit.setup_percpu_clockev();
 
 	wmb();
-	load_debug_registers();
 	cpu_idle();
 }
 
@@ -1269,7 +1267,6 @@ void cpu_disable_common(void)
 	remove_cpu_from_maps(cpu);
 	unlock_vector_lock();
 	fixup_irqs();
-	hw_breakpoint_disable();
 }
 
 int native_cpu_disable(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fc2974adf9b6..22dee7aa7813 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,6 +42,7 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
+#include <asm/debugreg.h>
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
@@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	trace_kvm_entry(vcpu->vcpu_id);
 	kvm_x86_ops->run(vcpu, kvm_run);
 
-	if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
-		set_debugreg(current->thread.debugreg[0], 0);
-		set_debugreg(current->thread.debugreg[1], 1);
-		set_debugreg(current->thread.debugreg[2], 2);
-		set_debugreg(current->thread.debugreg[3], 3);
-		set_debugreg(current->thread.debugreg6, 6);
-		set_debugreg(current->thread.debugreg7, 7);
-	}
+	/*
+	 * If the guest has used debug registers, at least dr7
+	 * will be disabled while returning to the host.
+	 * If we don't have active breakpoints in the host, we don't
+	 * care about the messed up debug address registers. But if
+	 * we have some of them active, restore the old state.
+	 */
+	if (__get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK)
+		hw_breakpoint_restore();
 
 	set_bit(KVM_REQ_KICK, &vcpu->requests);
 	local_irq_enable();
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index e09a44fc4664..0a979f3e5b8a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -105,7 +105,6 @@ static void __save_processor_state(struct saved_context *ctxt)
 	ctxt->cr4 = read_cr4();
 	ctxt->cr8 = read_cr8();
 #endif
-	hw_breakpoint_disable();
 }
 
 /* Needed by apm.c */
@@ -144,11 +143,6 @@ static void fix_processor_context(void)
 #endif
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
-
-	/*
-	 * Now maybe reload the debug registers
-	 */
-	load_debug_registers();
 }
 
 /**
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 61ccc8f17eac..7eba9b92e5f3 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -1,136 +1,131 @@
 #ifndef _LINUX_HW_BREAKPOINT_H
 #define _LINUX_HW_BREAKPOINT_H
 
+#include <linux/perf_event.h>
 
-#ifdef	__KERNEL__
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/kallsyms.h>
-
-/**
- * struct hw_breakpoint - unified kernel/user-space hardware breakpoint
- * @triggered: callback invoked after target address access
- * @info: arch-specific breakpoint info (address, length, and type)
- *
- * %hw_breakpoint structures are the kernel's way of representing
- * hardware breakpoints.  These are data breakpoints
- * (also known as "watchpoints", triggered on data access), and the breakpoint's
- * target address can be located in either kernel space or user space.
- *
- * The breakpoint's address, length, and type are highly
- * architecture-specific.  The values are encoded in the @info field; you
- * specify them when registering the breakpoint.  To examine the encoded
- * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared
- * below.
- *
- * The address is specified as a regular kernel pointer (for kernel-space
- * breakponts) or as an %__user pointer (for user-space breakpoints).
- * With register_user_hw_breakpoint(), the address must refer to a
- * location in user space.  The breakpoint will be active only while the
- * requested task is running.  Conversely with
- * register_kernel_hw_breakpoint(), the address must refer to a location
- * in kernel space, and the breakpoint will be active on all CPUs
- * regardless of the current task.
- *
- * The length is the breakpoint's extent in bytes, which is subject to
- * certain limitations.  include/asm/hw_breakpoint.h contains macros
- * defining the available lengths for a specific architecture.  Note that
- * the address's alignment must match the length.  The breakpoint will
- * catch accesses to any byte in the range from address to address +
- * (length - 1).
- *
- * The breakpoint's type indicates the sort of access that will cause it
- * to trigger.  Possible values may include:
- *
- * 	%HW_BREAKPOINT_RW (triggered on read or write access),
- * 	%HW_BREAKPOINT_WRITE (triggered on write access), and
- * 	%HW_BREAKPOINT_READ (triggered on read access).
- *
- * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all
- * possibilities are available on all architectures.  Execute breakpoints
- * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE.
- *
- * When a breakpoint gets hit, the @triggered callback is
- * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the
- * processor registers.
- * Data breakpoints occur after the memory access has taken place.
- * Breakpoints are disabled during execution @triggered, to avoid
- * recursive traps and allow unhindered access to breakpointed memory.
- *
- * This sample code sets a breakpoint on pid_max and registers a callback
- * function for writes to that variable.  Note that it is not portable
- * as written, because not all architectures support HW_BREAKPOINT_LEN_4.
- *
- * ----------------------------------------------------------------------
- *
- * #include <asm/hw_breakpoint.h>
- *
- * struct hw_breakpoint my_bp;
- *
- * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs)
- * {
- * 	printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n");
- * 	dump_stack();
- *  	.......<more debugging output>........
- * }
- *
- * static struct hw_breakpoint my_bp;
- *
- * static int init_module(void)
- * {
- *	..........<do anything>............
- *	my_bp.info.type = HW_BREAKPOINT_WRITE;
- *	my_bp.info.len = HW_BREAKPOINT_LEN_4;
- *
- *	my_bp.installed = (void *)my_bp_installed;
- *
- *	rc = register_kernel_hw_breakpoint(&my_bp);
- *	..........<do anything>............
- * }
- *
- * static void cleanup_module(void)
- * {
- *	..........<do anything>............
- *	unregister_kernel_hw_breakpoint(&my_bp);
- *	..........<do anything>............
- * }
- *
- * ----------------------------------------------------------------------
- */
-struct hw_breakpoint {
-	void (*triggered)(struct hw_breakpoint *, struct pt_regs *);
-	struct arch_hw_breakpoint info;
+enum {
+	HW_BREAKPOINT_LEN_1 = 1,
+	HW_BREAKPOINT_LEN_2 = 2,
+	HW_BREAKPOINT_LEN_4 = 4,
+	HW_BREAKPOINT_LEN_8 = 8,
 };
 
-/*
- * len and type values are defined in include/asm/hw_breakpoint.h.
- * Available values vary according to the architecture.  On i386 the
- * possibilities are:
- *
- *	HW_BREAKPOINT_LEN_1
- *	HW_BREAKPOINT_LEN_2
- *	HW_BREAKPOINT_LEN_4
- *	HW_BREAKPOINT_RW
- *	HW_BREAKPOINT_READ
- *
- * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the
- * 1-, 2-, and 4-byte lengths may be unavailable.  There also may be
- * HW_BREAKPOINT_WRITE.  You can use #ifdef to check at compile time.
- */
+enum {
+	HW_BREAKPOINT_R = 1,
+	HW_BREAKPOINT_W = 2,
+	HW_BREAKPOINT_X = 4,
+};
+
+static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
+{
+	return &bp->hw.info;
+}
+
+static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
+{
+	return bp->attr.bp_addr;
+}
+
+static inline int hw_breakpoint_type(struct perf_event *bp)
+{
+	return bp->attr.bp_type;
+}
+
+static inline int hw_breakpoint_len(struct perf_event *bp)
+{
+	return bp->attr.bp_len;
+}
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+extern struct perf_event *
+register_user_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    struct task_struct *tsk,
+			    bool active);
+
+/* FIXME: only change from the attr, and don't unregister */
+extern struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+			  unsigned long addr,
+			  int len,
+			  int type,
+			  perf_callback_t triggered,
+			  struct task_struct *tsk,
+			  bool active);
 
-extern int register_user_hw_breakpoint(struct task_struct *tsk,
-					struct hw_breakpoint *bp);
-extern int modify_user_hw_breakpoint(struct task_struct *tsk,
-					struct hw_breakpoint *bp);
-extern void unregister_user_hw_breakpoint(struct task_struct *tsk,
-						struct hw_breakpoint *bp);
 /*
  * Kernel breakpoints are not associated with any particular thread.
  */
-extern int register_kernel_hw_breakpoint(struct hw_breakpoint *bp);
-extern void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp);
+extern struct perf_event *
+register_wide_hw_breakpoint_cpu(unsigned long addr,
+				int len,
+				int type,
+				perf_callback_t triggered,
+				int cpu,
+				bool active);
+
+extern struct perf_event **
+register_wide_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    bool active);
+
+extern int register_perf_hw_breakpoint(struct perf_event *bp);
+extern int __register_perf_hw_breakpoint(struct perf_event *bp);
+extern void unregister_hw_breakpoint(struct perf_event *bp);
+extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events);
+
+extern int reserve_bp_slot(struct perf_event *bp);
+extern void release_bp_slot(struct perf_event *bp);
+
+extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk);
+
+#else /* !CONFIG_HAVE_HW_BREAKPOINT */
+
+static inline struct perf_event *
+register_user_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    struct task_struct *tsk,
+			    bool active)		{ return NULL; }
+static inline struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+			  unsigned long addr,
+			  int len,
+			  int type,
+			  perf_callback_t triggered,
+			  struct task_struct *tsk,
+			  bool active)			{ return NULL; }
+static inline struct perf_event *
+register_wide_hw_breakpoint_cpu(unsigned long addr,
+				int len,
+				int type,
+				perf_callback_t triggered,
+				int cpu,
+				bool active)		{ return NULL; }
+static inline struct perf_event **
+register_wide_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    bool active)		{ return NULL; }
+static inline int
+register_perf_hw_breakpoint(struct perf_event *bp)	{ return -ENOSYS; }
+static inline int
+__register_perf_hw_breakpoint(struct perf_event *bp) 	{ return -ENOSYS; }
+static inline void unregister_hw_breakpoint(struct perf_event *bp)	{ }
+static inline void
+unregister_wide_hw_breakpoint(struct perf_event **cpu_events)		{ }
+static inline int
+reserve_bp_slot(struct perf_event *bp)			{return -ENOSYS; }
+static inline void release_bp_slot(struct perf_event *bp) 		{ }
+
+static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk)	{ }
 
-extern unsigned int hbp_kernel_pos;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
 
-#endif	/* __KERNEL__ */
-#endif	/* _LINUX_HW_BREAKPOINT_H */
+#endif /* _LINUX_HW_BREAKPOINT_H */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8d54e6d25eeb..cead64ea6c15 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -18,6 +18,10 @@
 #include <linux/ioctl.h>
 #include <asm/byteorder.h>
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+#include <asm/hw_breakpoint.h>
+#endif
+
 /*
  * User-space ABI bits:
  */
@@ -31,6 +35,7 @@ enum perf_type_id {
 	PERF_TYPE_TRACEPOINT			= 2,
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
+	PERF_TYPE_BREAKPOINT			= 5,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
@@ -207,6 +212,15 @@ struct perf_event_attr {
 		__u32		wakeup_events;	  /* wakeup every n events */
 		__u32		wakeup_watermark; /* bytes before wakeup   */
 	};
+
+	union {
+		struct { /* Hardware breakpoint info */
+			__u64		bp_addr;
+			__u32		bp_type;
+			__u32		bp_len;
+		};
+	};
+
 	__u32			__reserved_2;
 
 	__u64			__reserved_3;
@@ -476,6 +490,11 @@ struct hw_perf_event {
 			atomic64_t	count;
 			struct hrtimer	hrtimer;
 		};
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+		union { /* breakpoint */
+			struct arch_hw_breakpoint	info;
+		};
+#endif
 	};
 	atomic64_t			prev_count;
 	u64				sample_period;
@@ -588,7 +607,7 @@ struct perf_event {
 	u64				tstamp_running;
 	u64				tstamp_stopped;
 
-	struct perf_event_attr	attr;
+	struct perf_event_attr		attr;
 	struct hw_perf_event		hw;
 
 	struct perf_event_context	*ctx;
@@ -643,6 +662,8 @@ struct perf_event {
 
 	perf_callback_t			callback;
 
+	perf_callback_t			event_callback;
+
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -831,6 +852,7 @@ extern int sysctl_perf_event_sample_rate;
 extern void perf_event_init(void);
 extern void perf_tp_event(int event_id, u64 addr, u64 count,
 				 void *record, int entry_size);
+extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
 #define perf_misc_flags(regs)	(user_mode(regs) ? PERF_RECORD_MISC_USER : \
@@ -865,6 +887,8 @@ static inline int perf_event_task_enable(void)				{ return -EINVAL; }
 static inline void
 perf_sw_event(u32 event_id, u64 nr, int nmi,
 		     struct pt_regs *regs, u64 addr)			{ }
+static inline void
+perf_bp_event(struct perf_event *event, void *data)		{ }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
diff --git a/kernel/exit.c b/kernel/exit.c
index e61891f80123..266f8920628a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -979,6 +980,10 @@ NORET_TYPE void do_exit(long code)
 
 	proc_exit_connector(tsk);
 
+	/*
+	 * FIXME: do that only when needed, using sched_exit tracepoint
+	 */
+	flush_ptrace_hw_breakpoint(tsk);
 	/*
 	 * Flush inherited counters to the parent - before the parent
 	 * gets woken up by child-exit notifications.
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c1f64e65a9f3..08f6d0163201 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -15,6 +15,7 @@
  *
  * Copyright (C) 2007 Alan Stern
  * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
  */
 
 /*
@@ -35,334 +36,242 @@
 #include <linux/init.h>
 #include <linux/smp.h>
 
-#include <asm/hw_breakpoint.h>
+#include <linux/hw_breakpoint.h>
+
 #include <asm/processor.h>
 
 #ifdef CONFIG_X86
 #include <asm/debugreg.h>
 #endif
-/*
- * Spinlock that protects all (un)register operations over kernel/user-space
- * breakpoint requests
- */
-static DEFINE_SPINLOCK(hw_breakpoint_lock);
-
-/* Array of kernel-space breakpoint structures */
-struct hw_breakpoint *hbp_kernel[HBP_NUM];
-
-/*
- * Per-processor copy of hbp_kernel[]. Used only when hbp_kernel is being
- * modified but we need the older copy to handle any hbp exceptions. It will
- * sync with hbp_kernel[] value after updation is done through IPIs.
- */
-DEFINE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]);
-
-/*
- * Kernel breakpoints grow downwards, starting from HBP_NUM
- * 'hbp_kernel_pos' denotes lowest numbered breakpoint register occupied for
- * kernel-space request. We will initialise it here and not in an __init
- * routine because load_debug_registers(), which uses this variable can be
- * called very early during CPU initialisation.
- */
-unsigned int hbp_kernel_pos = HBP_NUM;
 
-/*
- * An array containing refcount of threads using a given bkpt register
- * Accesses are synchronised by acquiring hw_breakpoint_lock
- */
-unsigned int hbp_user_refcount[HBP_NUM];
+static atomic_t bp_slot;
 
-/*
- * Load the debug registers during startup of a CPU.
- */
-void load_debug_registers(void)
+int reserve_bp_slot(struct perf_event *bp)
 {
-	unsigned long flags;
-	struct task_struct *tsk = current;
-
-	spin_lock_bh(&hw_breakpoint_lock);
-
-	/* Prevent IPIs for new kernel breakpoint updates */
-	local_irq_save(flags);
-	arch_update_kernel_hw_breakpoint(NULL);
-	local_irq_restore(flags);
-
-	if (test_tsk_thread_flag(tsk, TIF_DEBUG))
-		arch_install_thread_hw_breakpoint(tsk);
-
-	spin_unlock_bh(&hw_breakpoint_lock);
-}
+	if (atomic_inc_return(&bp_slot) == HBP_NUM) {
+		atomic_dec(&bp_slot);
 
-/*
- * Erase all the hardware breakpoint info associated with a thread.
- *
- * If tsk != current then tsk must not be usable (for example, a
- * child being cleaned up from a failed fork).
- */
-void flush_thread_hw_breakpoint(struct task_struct *tsk)
-{
-	int i;
-	struct thread_struct *thread = &(tsk->thread);
-
-	spin_lock_bh(&hw_breakpoint_lock);
-
-	/* The thread no longer has any breakpoints associated with it */
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
-	for (i = 0; i < HBP_NUM; i++) {
-		if (thread->hbp[i]) {
-			hbp_user_refcount[i]--;
-			kfree(thread->hbp[i]);
-			thread->hbp[i] = NULL;
-		}
+		return -ENOSPC;
 	}
 
-	arch_flush_thread_hw_breakpoint(tsk);
-
-	/* Actually uninstall the breakpoints if necessary */
-	if (tsk == current)
-		arch_uninstall_thread_hw_breakpoint();
-	spin_unlock_bh(&hw_breakpoint_lock);
+	return 0;
 }
 
-/*
- * Copy the hardware breakpoint info from a thread to its cloned child.
- */
-int copy_thread_hw_breakpoint(struct task_struct *tsk,
-		struct task_struct *child, unsigned long clone_flags)
+void release_bp_slot(struct perf_event *bp)
 {
-	/*
-	 * We will assume that breakpoint settings are not inherited
-	 * and the child starts out with no debug registers set.
-	 * But what about CLONE_PTRACE?
-	 */
-	clear_tsk_thread_flag(child, TIF_DEBUG);
-
-	/* We will call flush routine since the debugregs are not inherited */
-	arch_flush_thread_hw_breakpoint(child);
-
-	return 0;
+	atomic_dec(&bp_slot);
 }
 
-static int __register_user_hw_breakpoint(int pos, struct task_struct *tsk,
-					struct hw_breakpoint *bp)
+int __register_perf_hw_breakpoint(struct perf_event *bp)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int rc;
+	int ret;
 
-	/* Do not overcommit. Fail if kernel has used the hbp registers */
-	if (pos >= hbp_kernel_pos)
-		return -ENOSPC;
+	ret = reserve_bp_slot(bp);
+	if (ret)
+		return ret;
 
-	rc = arch_validate_hwbkpt_settings(bp, tsk);
-	if (rc)
-		return rc;
+	if (!bp->attr.disabled)
+		ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
 
-	thread->hbp[pos] = bp;
-	hbp_user_refcount[pos]++;
+	return ret;
+}
 
-	arch_update_user_hw_breakpoint(pos, tsk);
-	/*
-	 * Does it need to be installed right now?
-	 * Otherwise it will get installed the next time tsk runs
-	 */
-	if (tsk == current)
-		arch_install_thread_hw_breakpoint(tsk);
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+	bp->callback = perf_bp_event;
 
-	return rc;
+	return __register_perf_hw_breakpoint(bp);
 }
 
 /*
- * Modify the address of a hbp register already in use by the task
- * Do not invoke this in-lieu of a __unregister_user_hw_breakpoint()
+ * Register a breakpoint bound to a task and a given cpu.
+ * If cpu is -1, the breakpoint is active for the task in every cpu
+ * If the task is -1, the breakpoint is active for every tasks in the given
+ * cpu.
  */
-static int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk,
-					struct hw_breakpoint *bp)
+static struct perf_event *
+register_user_hw_breakpoint_cpu(unsigned long addr,
+				int len,
+				int type,
+				perf_callback_t triggered,
+				pid_t pid,
+				int cpu,
+				bool active)
 {
-	struct thread_struct *thread = &(tsk->thread);
-
-	if ((pos >= hbp_kernel_pos) || (arch_validate_hwbkpt_settings(bp, tsk)))
-		return -EINVAL;
-
-	if (thread->hbp[pos] == NULL)
-		return -EINVAL;
-
-	thread->hbp[pos] = bp;
+	struct perf_event_attr *attr;
+	struct perf_event *bp;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return ERR_PTR(-ENOMEM);
+
+	attr->type = PERF_TYPE_BREAKPOINT;
+	attr->size = sizeof(*attr);
+	attr->bp_addr = addr;
+	attr->bp_len = len;
+	attr->bp_type = type;
 	/*
-	 * 'pos' must be that of a hbp register already used by 'tsk'
-	 * Otherwise arch_modify_user_hw_breakpoint() will fail
+	 * Such breakpoints are used by debuggers to trigger signals when
+	 * we hit the excepted memory op. We can't miss such events, they
+	 * must be pinned.
 	 */
-	arch_update_user_hw_breakpoint(pos, tsk);
+	attr->pinned = 1;
 
-	if (tsk == current)
-		arch_install_thread_hw_breakpoint(tsk);
+	if (!active)
+		attr->disabled = 1;
 
-	return 0;
-}
-
-static void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk)
-{
-	hbp_user_refcount[pos]--;
-	tsk->thread.hbp[pos] = NULL;
+	bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered);
+	kfree(attr);
 
-	arch_update_user_hw_breakpoint(pos, tsk);
-
-	if (tsk == current)
-		arch_install_thread_hw_breakpoint(tsk);
+	return bp;
 }
 
 /**
  * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @addr: is the memory address that triggers the breakpoint
+ * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
+ * @type: the type of the access to the memory (read/write/exec)
+ * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @bp: the breakpoint structure to register
- *
- * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
- * @bp->triggered must be set properly before invocation
+ * @active: should we activate it while registering it
  *
  */
-int register_user_hw_breakpoint(struct task_struct *tsk,
-					struct hw_breakpoint *bp)
+struct perf_event *
+register_user_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    struct task_struct *tsk,
+			    bool active)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int i, rc = -ENOSPC;
-
-	spin_lock_bh(&hw_breakpoint_lock);
-
-	for (i = 0; i < hbp_kernel_pos; i++) {
-		if (!thread->hbp[i]) {
-			rc = __register_user_hw_breakpoint(i, tsk, bp);
-			break;
-		}
-	}
-	if (!rc)
-		set_tsk_thread_flag(tsk, TIF_DEBUG);
-
-	spin_unlock_bh(&hw_breakpoint_lock);
-	return rc;
+	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
+					       tsk->pid, -1, active);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
 /**
  * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @addr: is the memory address that triggers the breakpoint
+ * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
+ * @type: the type of the access to the memory (read/write/exec)
+ * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @bp: the breakpoint structure to unregister
- *
+ * @active: should we activate it while registering it
  */
-int modify_user_hw_breakpoint(struct task_struct *tsk, struct hw_breakpoint *bp)
+struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+			  unsigned long addr,
+			  int len,
+			  int type,
+			  perf_callback_t triggered,
+			  struct task_struct *tsk,
+			  bool active)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int i, ret = -ENOENT;
+	/*
+	 * FIXME: do it without unregistering
+	 * - We don't want to lose our slot
+	 * - If the new bp is incorrect, don't lose the older one
+	 */
+	unregister_hw_breakpoint(bp);
 
-	spin_lock_bh(&hw_breakpoint_lock);
-	for (i = 0; i < hbp_kernel_pos; i++) {
-		if (bp == thread->hbp[i]) {
-			ret = __modify_user_hw_breakpoint(i, tsk, bp);
-			break;
-		}
-	}
-	spin_unlock_bh(&hw_breakpoint_lock);
-	return ret;
+	return register_user_hw_breakpoint(addr, len, type, triggered,
+					   tsk, active);
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
 
 /**
- * unregister_user_hw_breakpoint - unregister a user-space hardware breakpoint
- * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
  * @bp: the breakpoint structure to unregister
- *
  */
-void unregister_user_hw_breakpoint(struct task_struct *tsk,
-						struct hw_breakpoint *bp)
+void unregister_hw_breakpoint(struct perf_event *bp)
 {
-	struct thread_struct *thread = &(tsk->thread);
-	int i, pos = -1, hbp_counter = 0;
-
-	spin_lock_bh(&hw_breakpoint_lock);
-	for (i = 0; i < hbp_kernel_pos; i++) {
-		if (thread->hbp[i])
-			hbp_counter++;
-		if (bp == thread->hbp[i])
-			pos = i;
-	}
-	if (pos >= 0) {
-		__unregister_user_hw_breakpoint(pos, tsk);
-		hbp_counter--;
-	}
-	if (!hbp_counter)
-		clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
-	spin_unlock_bh(&hw_breakpoint_lock);
+	if (!bp)
+		return;
+	perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+
+static struct perf_event *
+register_kernel_hw_breakpoint_cpu(unsigned long addr,
+				  int len,
+				  int type,
+				  perf_callback_t triggered,
+				  int cpu,
+				  bool active)
+{
+	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
+					       -1, cpu, active);
 }
-EXPORT_SYMBOL_GPL(unregister_user_hw_breakpoint);
 
 /**
- * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space
- * @bp: the breakpoint structure to register
- *
- * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and
- * @bp->triggered must be set properly before invocation
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @addr: is the memory address that triggers the breakpoint
+ * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
+ * @type: the type of the access to the memory (read/write/exec)
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @active: should we activate it while registering it
  *
+ * @return a set of per_cpu pointers to perf events
  */
-int register_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+struct perf_event **
+register_wide_hw_breakpoint(unsigned long addr,
+			    int len,
+			    int type,
+			    perf_callback_t triggered,
+			    bool active)
 {
-	int rc;
+	struct perf_event **cpu_events, **pevent, *bp;
+	long err;
+	int cpu;
+
+	cpu_events = alloc_percpu(typeof(*cpu_events));
+	if (!cpu_events)
+		return ERR_PTR(-ENOMEM);
 
-	rc = arch_validate_hwbkpt_settings(bp, NULL);
-	if (rc)
-		return rc;
+	for_each_possible_cpu(cpu) {
+		pevent = per_cpu_ptr(cpu_events, cpu);
+		bp = register_kernel_hw_breakpoint_cpu(addr, len, type,
+					triggered, cpu, active);
 
-	spin_lock_bh(&hw_breakpoint_lock);
+		*pevent = bp;
 
-	rc = -ENOSPC;
-	/* Check if we are over-committing */
-	if ((hbp_kernel_pos > 0) && (!hbp_user_refcount[hbp_kernel_pos-1])) {
-		hbp_kernel_pos--;
-		hbp_kernel[hbp_kernel_pos] = bp;
-		on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1);
-		rc = 0;
+		if (IS_ERR(bp) || !bp) {
+			err = PTR_ERR(bp);
+			goto fail;
+		}
 	}
 
-	spin_unlock_bh(&hw_breakpoint_lock);
-	return rc;
+	return cpu_events;
+
+fail:
+	for_each_possible_cpu(cpu) {
+		pevent = per_cpu_ptr(cpu_events, cpu);
+		if (IS_ERR(*pevent) || !*pevent)
+			break;
+		unregister_hw_breakpoint(*pevent);
+	}
+	free_percpu(cpu_events);
+	/* return the error if any */
+	return ERR_PTR(err);
 }
-EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint);
 
 /**
- * unregister_kernel_hw_breakpoint - unregister a HW breakpoint for kernel space
- * @bp: the breakpoint structure to unregister
- *
- * Uninstalls and unregisters @bp.
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
  */
-void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp)
+void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
 {
-	int i, j;
-
-	spin_lock_bh(&hw_breakpoint_lock);
-
-	/* Find the 'bp' in our list of breakpoints for kernel */
-	for (i = hbp_kernel_pos; i < HBP_NUM; i++)
-		if (bp == hbp_kernel[i])
-			break;
+	int cpu;
+	struct perf_event **pevent;
 
-	/* Check if we did not find a match for 'bp'. If so return early */
-	if (i == HBP_NUM) {
-		spin_unlock_bh(&hw_breakpoint_lock);
-		return;
+	for_each_possible_cpu(cpu) {
+		pevent = per_cpu_ptr(cpu_events, cpu);
+		unregister_hw_breakpoint(*pevent);
 	}
-
-	/*
-	 * We'll shift the breakpoints one-level above to compact if
-	 * unregistration creates a hole
-	 */
-	for (j = i; j > hbp_kernel_pos; j--)
-		hbp_kernel[j] = hbp_kernel[j-1];
-
-	hbp_kernel[hbp_kernel_pos] = NULL;
-	on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1);
-	hbp_kernel_pos++;
-
-	spin_unlock_bh(&hw_breakpoint_lock);
+	free_percpu(cpu_events);
 }
-EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint);
+
 
 static struct notifier_block hw_breakpoint_exceptions_nb = {
 	.notifier_call = hw_breakpoint_exceptions_notify,
@@ -374,5 +283,12 @@ static int __init init_hw_breakpoint(void)
 {
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
 }
-
 core_initcall(init_hw_breakpoint);
+
+
+struct pmu perf_ops_bp = {
+	.enable		= arch_install_hw_breakpoint,
+	.disable	= arch_uninstall_hw_breakpoint,
+	.read		= hw_breakpoint_pmu_read,
+	.unthrottle	= hw_breakpoint_pmu_unthrottle
+};
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 5087125e2a00..98dc56b2ebe4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -29,6 +29,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
@@ -4229,6 +4230,51 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #endif /* CONFIG_EVENT_PROFILE */
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+	release_bp_slot(event);
+}
+
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+	int err;
+	/*
+	 * The breakpoint is already filled if we haven't created the counter
+	 * through perf syscall
+	 * FIXME: manage to get trigerred to NULL if it comes from syscalls
+	 */
+	if (!bp->callback)
+		err = register_perf_hw_breakpoint(bp);
+	else
+		err = __register_perf_hw_breakpoint(bp);
+	if (err)
+		return ERR_PTR(err);
+
+	bp->destroy = bp_perf_event_destroy;
+
+	return &perf_ops_bp;
+}
+
+void perf_bp_event(struct perf_event *bp, void *regs)
+{
+	/* TODO */
+}
+#else
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+}
+
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+	return NULL;
+}
+
+void perf_bp_event(struct perf_event *bp, void *regs)
+{
+}
+#endif
+
 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 static void sw_perf_event_destroy(struct perf_event *event)
@@ -4375,6 +4421,11 @@ perf_event_alloc(struct perf_event_attr *attr,
 		pmu = tp_perf_event_init(event);
 		break;
 
+	case PERF_TYPE_BREAKPOINT:
+		pmu = bp_perf_event_init(event);
+		break;
+
+
 	default:
 		break;
 	}
@@ -4686,7 +4737,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	ctx = find_get_context(pid, cpu);
 	if (IS_ERR(ctx))
-		return NULL ;
+		return NULL;
 
 	event = perf_event_alloc(attr, cpu, ctx, NULL,
 				     NULL, callback, GFP_KERNEL);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 91c3d0e9a5a1..d72f06ff263f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,14 +11,11 @@
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
+#include <linux/hw_breakpoint.h>
 
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
 
-#ifdef CONFIG_KSYM_TRACER
-#include <asm/hw_breakpoint.h>
-#endif
-
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e19747d4f860..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -372,11 +372,11 @@ FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
 	F_STRUCT(
 		__field(	unsigned long,	ip			  )
 		__field(	unsigned char,	type			  )
-		__array(	char	     ,	ksym_name, KSYM_NAME_LEN  )
 		__array(	char	     ,	cmd,	   TASK_COMM_LEN  )
+		__field(	unsigned long,  addr			  )
 	),
 
-	F_printk("ip: %pF type: %d ksym_name: %s cmd: %s",
+	F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
 		(void *)__entry->ip, (unsigned int)__entry->type,
-		__entry->ksym_name, __entry->cmd)
+		(void *)__entry->addr,  __entry->cmd)
 );
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 6d5609c67378..fea83eeeef09 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -29,7 +29,11 @@
 #include "trace_stat.h"
 #include "trace.h"
 
-/* For now, let us restrict the no. of symbols traced simultaneously to number
+#include <linux/hw_breakpoint.h>
+#include <asm/hw_breakpoint.h>
+
+/*
+ * For now, let us restrict the no. of symbols traced simultaneously to number
  * of available hardware breakpoint registers.
  */
 #define KSYM_TRACER_MAX HBP_NUM
@@ -37,8 +41,10 @@
 #define KSYM_TRACER_OP_LEN 3 /* rw- */
 
 struct trace_ksym {
-	struct hw_breakpoint	*ksym_hbp;
+	struct perf_event	**ksym_hbp;
 	unsigned long		ksym_addr;
+	int			type;
+	int			len;
 #ifdef CONFIG_PROFILE_KSYM_TRACER
 	unsigned long		counter;
 #endif
@@ -75,10 +81,11 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
 }
 #endif /* CONFIG_PROFILE_KSYM_TRACER */
 
-void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
+void ksym_hbp_handler(struct perf_event *hbp, void *data)
 {
 	struct ring_buffer_event *event;
 	struct ksym_trace_entry *entry;
+	struct pt_regs *regs = data;
 	struct ring_buffer *buffer;
 	int pc;
 
@@ -96,12 +103,12 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs)
 
 	entry		= ring_buffer_event_data(event);
 	entry->ip	= instruction_pointer(regs);
-	entry->type	= hbp->info.type;
-	strlcpy(entry->ksym_name, hbp->info.name, KSYM_SYMBOL_LEN);
+	entry->type	= hw_breakpoint_type(hbp);
+	entry->addr	= hw_breakpoint_addr(hbp);
 	strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
 
 #ifdef CONFIG_PROFILE_KSYM_TRACER
-	ksym_collect_stats(hbp->info.address);
+	ksym_collect_stats(hw_breakpoint_addr(hbp));
 #endif /* CONFIG_PROFILE_KSYM_TRACER */
 
 	trace_buffer_unlock_commit(buffer, event, 0, pc);
@@ -120,31 +127,21 @@ static int ksym_trace_get_access_type(char *str)
 	int access = 0;
 
 	if (str[0] == 'r')
-		access += 4;
-	else if (str[0] != '-')
-		return -EINVAL;
+		access |= HW_BREAKPOINT_R;
 
 	if (str[1] == 'w')
-		access += 2;
-	else if (str[1] != '-')
-		return -EINVAL;
+		access |= HW_BREAKPOINT_W;
 
-	if (str[2] != '-')
-		return -EINVAL;
+	if (str[2] == 'x')
+		access |= HW_BREAKPOINT_X;
 
 	switch (access) {
-	case 6:
-		access = HW_BREAKPOINT_RW;
-		break;
-	case 4:
-		access = -EINVAL;
-		break;
-	case 2:
-		access = HW_BREAKPOINT_WRITE;
-		break;
+	case HW_BREAKPOINT_W:
+	case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+		return access;
+	default:
+		return -EINVAL;
 	}
-
-	return access;
 }
 
 /*
@@ -194,36 +191,33 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 	if (!entry)
 		return -ENOMEM;
 
-	entry->ksym_hbp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL);
-	if (!entry->ksym_hbp)
-		goto err;
-
-	entry->ksym_hbp->info.name = kstrdup(ksymname, GFP_KERNEL);
-	if (!entry->ksym_hbp->info.name)
-		goto err;
-
-	entry->ksym_hbp->info.type = op;
-	entry->ksym_addr = entry->ksym_hbp->info.address = addr;
-#ifdef CONFIG_X86
-	entry->ksym_hbp->info.len = HW_BREAKPOINT_LEN_4;
-#endif
-	entry->ksym_hbp->triggered = (void *)ksym_hbp_handler;
+	entry->type = op;
+	entry->ksym_addr = addr;
+	entry->len = HW_BREAKPOINT_LEN_4;
+
+	ret = -EAGAIN;
+	entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr,
+					entry->len, entry->type,
+					ksym_hbp_handler, true);
+	if (IS_ERR(entry->ksym_hbp)) {
+		entry->ksym_hbp = NULL;
+		ret = PTR_ERR(entry->ksym_hbp);
+	}
 
-	ret = register_kernel_hw_breakpoint(entry->ksym_hbp);
-	if (ret < 0) {
+	if (!entry->ksym_hbp) {
 		printk(KERN_INFO "ksym_tracer request failed. Try again"
 					" later!!\n");
-		ret = -EAGAIN;
 		goto err;
 	}
+
 	hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
 	ksym_filter_entry_count++;
+
 	return 0;
+
 err:
-	if (entry->ksym_hbp)
-		kfree(entry->ksym_hbp->info.name);
-	kfree(entry->ksym_hbp);
 	kfree(entry);
+
 	return ret;
 }
 
@@ -244,10 +238,10 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
 	mutex_lock(&ksym_tracer_mutex);
 
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
-		ret = trace_seq_printf(s, "%s:", entry->ksym_hbp->info.name);
-		if (entry->ksym_hbp->info.type == HW_BREAKPOINT_WRITE)
+		ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr);
+		if (entry->type == HW_BREAKPOINT_W)
 			ret = trace_seq_puts(s, "-w-\n");
-		else if (entry->ksym_hbp->info.type == HW_BREAKPOINT_RW)
+		else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
 			ret = trace_seq_puts(s, "rw-\n");
 		WARN_ON_ONCE(!ret);
 	}
@@ -269,12 +263,10 @@ static void __ksym_trace_reset(void)
 	mutex_lock(&ksym_tracer_mutex);
 	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
 								ksym_hlist) {
-		unregister_kernel_hw_breakpoint(entry->ksym_hbp);
+		unregister_wide_hw_breakpoint(entry->ksym_hbp);
 		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
-		kfree(entry->ksym_hbp->info.name);
-		kfree(entry->ksym_hbp);
 		kfree(entry);
 	}
 	mutex_unlock(&ksym_tracer_mutex);
@@ -327,7 +319,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
 		if (entry->ksym_addr == ksym_addr) {
 			/* Check for malformed request: (6) */
-			if (entry->ksym_hbp->info.type != op)
+			if (entry->type != op)
 				changed = 1;
 			else
 				goto out;
@@ -335,18 +327,21 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 		}
 	}
 	if (changed) {
-		unregister_kernel_hw_breakpoint(entry->ksym_hbp);
-		entry->ksym_hbp->info.type = op;
+		unregister_wide_hw_breakpoint(entry->ksym_hbp);
+		entry->type = op;
 		if (op > 0) {
-			ret = register_kernel_hw_breakpoint(entry->ksym_hbp);
-			if (ret == 0)
+			entry->ksym_hbp =
+				register_wide_hw_breakpoint(entry->ksym_addr,
+					entry->len, entry->type,
+					ksym_hbp_handler, true);
+			if (IS_ERR(entry->ksym_hbp))
+				entry->ksym_hbp = NULL;
+			if (!entry->ksym_hbp)
 				goto out;
 		}
 		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
-		kfree(entry->ksym_hbp->info.name);
-		kfree(entry->ksym_hbp);
 		kfree(entry);
 		ret = 0;
 		goto out;
@@ -413,16 +408,16 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_printf(s, "%11s-%-5d [%03d] %-30s ", field->cmd,
-				entry->pid, iter->cpu, field->ksym_name);
+	ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
+				entry->pid, iter->cpu, (char *)field->addr);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	switch (field->type) {
-	case HW_BREAKPOINT_WRITE:
+	case HW_BREAKPOINT_W:
 		ret = trace_seq_printf(s, " W  ");
 		break;
-	case HW_BREAKPOINT_RW:
+	case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
 		ret = trace_seq_printf(s, " RW ");
 		break;
 	default:
@@ -490,14 +485,13 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v)
 
 	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
 
-	if (entry->ksym_hbp)
-		access_type = entry->ksym_hbp->info.type;
+	access_type = entry->type;
 
 	switch (access_type) {
-	case HW_BREAKPOINT_WRITE:
+	case HW_BREAKPOINT_W:
 		seq_puts(m, "  W           ");
 		break;
-	case HW_BREAKPOINT_RW:
+	case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
 		seq_puts(m, "  RW          ");
 		break;
 	default:
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 7179c12e4f0f..27c5072c2e6b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -828,7 +828,8 @@ trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
 
 	ksym_selftest_dummy = 0;
 	/* Register the read-write tracing request */
-	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW,
+	ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY,
+				     HW_BREAKPOINT_R | HW_BREAKPOINT_W,
 					(unsigned long)(&ksym_selftest_dummy));
 
 	if (ret < 0) {
-- 
cgit v1.2.3


From d4cada4ae1c012815f95fa507eb86a0ae9d607d7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 8 Nov 2009 10:17:30 +0000
Subject: udp: split sk_hash into two u16 hashes

Union sk_hash with two u16 hashes for udp (no extra memory taken)

One 16 bits hash on (local port) value (the previous udp 'hash')

One 16 bits hash on (local address, local port) values, initialized
but not yet used. This second hash is using jenkin hash for better
distribution.

Because the 'port' is xored later, a partial hash is performed
on local address + net_hash_mix(net)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h |  2 ++
 include/net/sock.h  |  6 +++++-
 net/ipv4/udp.c      | 25 +++++++++++++++++++------
 net/ipv6/udp.c      | 27 +++++++++++++++++++++++++--
 4 files changed, 51 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 832361e3e596..5b4b5274e683 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -55,6 +55,8 @@ static inline int udp_hashfn(struct net *net, unsigned num, unsigned mask)
 struct udp_sock {
 	/* inet_sock has to be the first member */
 	struct inet_sock inet;
+#define udp_port_hash		inet.sk.__sk_common.skc_u16hashes[0]
+#define udp_portaddr_hash	inet.sk.__sk_common.skc_u16hashes[1]
 	int		 pending;	/* Any pending frames ? */
 	unsigned int	 corkflag;	/* Cork is required */
   	__u16		 encap_type;	/* Is this an Encapsulation socket? */
diff --git a/include/net/sock.h b/include/net/sock.h
index 55de3bd719a5..827366b62680 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -109,6 +109,7 @@ struct net;
  *	@skc_refcnt: reference count
  *	@skc_tx_queue_mapping: tx queue number for this connection
  *	@skc_hash: hash value used with various protocol lookup tables
+ *	@skc_u16hashes: two u16 hash values used by UDP lookup tables
  *	@skc_family: network address family
  *	@skc_state: Connection state
  *	@skc_reuse: %SO_REUSEADDR setting
@@ -131,7 +132,10 @@ struct sock_common {
 	atomic_t		skc_refcnt;
 	int			skc_tx_queue_mapping;
 
-	unsigned int		skc_hash;
+	union  {
+		unsigned int	skc_hash;
+		__u16		skc_u16hashes[2];
+	};
 	unsigned short		skc_family;
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ffc837643a04..af72de1c8690 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -138,13 +138,14 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 	sk_nulls_for_each(sk2, node, &hslot->head)
 		if (net_eq(sock_net(sk2), net)			&&
 		    sk2 != sk					&&
-		    (bitmap || sk2->sk_hash == num)		&&
+		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
 		    (!sk2->sk_reuse || !sk->sk_reuse)		&&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
 			|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    (*saddr_comp)(sk, sk2)) {
 			if (bitmap)
-				__set_bit(sk2->sk_hash >> log, bitmap);
+				__set_bit(udp_sk(sk2)->udp_port_hash >> log,
+					  bitmap);
 			else
 				return 1;
 		}
@@ -215,7 +216,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	}
 found:
 	inet_sk(sk)->inet_num = snum;
-	sk->sk_hash = snum;
+	udp_sk(sk)->udp_port_hash = snum;
+	udp_sk(sk)->udp_portaddr_hash ^= snum;
 	if (sk_unhashed(sk)) {
 		sk_nulls_add_node_rcu(sk, &hslot->head);
 		hslot->count++;
@@ -238,8 +240,19 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
 		   inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
 }
 
+static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
+				       unsigned int port)
+{
+	return jhash_1word(saddr, net_hash_mix(net)) ^ port;
+}
+
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
 {
+	/* precompute partial secondary hash */
+	udp_sk(sk)->udp_portaddr_hash =
+		udp4_portaddr_hash(sock_net(sk),
+				   inet_sk(sk)->inet_rcv_saddr,
+				   0);
 	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
 }
 
@@ -249,7 +262,7 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
 {
 	int score = -1;
 
-	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+	if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
 			!ipv6_only_sock(sk)) {
 		struct inet_sock *inet = inet_sk(sk);
 
@@ -360,7 +373,7 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 		struct inet_sock *inet = inet_sk(s);
 
 		if (!net_eq(sock_net(s), net)				||
-		    s->sk_hash != hnum					||
+		    udp_sk(s)->udp_port_hash != hnum			||
 		    (inet->inet_daddr && inet->inet_daddr != rmt_addr)	||
 		    (inet->inet_dport != rmt_port && inet->inet_dport)	||
 		    (inet->inet_rcv_saddr	&&
@@ -1050,7 +1063,7 @@ void udp_lib_unhash(struct sock *sk)
 	if (sk_hashed(sk)) {
 		struct udp_table *udptable = sk->sk_prot->h.udp_table;
 		struct udp_hslot *hslot = udp_hashslot(udptable, sock_net(sk),
-						     sk->sk_hash);
+						       udp_sk(sk)->udp_port_hash);
 
 		spin_lock_bh(&hslot->lock);
 		if (sk_nulls_del_node_init_rcu(sk)) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5bc7cdbf030a..1e5fadd997b7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -81,8 +81,30 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 	return 0;
 }
 
+static unsigned int udp6_portaddr_hash(struct net *net,
+				       const struct in6_addr *addr6,
+				       unsigned int port)
+{
+	unsigned int hash, mix = net_hash_mix(net);
+
+	if (ipv6_addr_any(addr6))
+		hash = jhash_1word(0, mix);
+	else if (ipv6_addr_type(addr6) == IPV6_ADDR_MAPPED)
+		hash = jhash_1word(addr6->s6_addr32[3], mix);
+	else
+		hash = jhash2(addr6->s6_addr32, 4, mix);
+
+	return hash ^ port;
+}
+
+
 int udp_v6_get_port(struct sock *sk, unsigned short snum)
 {
+	/* precompute partial secondary hash */
+	udp_sk(sk)->udp_portaddr_hash =
+		udp6_portaddr_hash(sock_net(sk),
+				   &inet6_sk(sk)->rcv_saddr,
+				   0);
 	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal);
 }
 
@@ -94,7 +116,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 {
 	int score = -1;
 
-	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+	if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
 			sk->sk_family == PF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
 		struct inet_sock *inet = inet_sk(sk);
@@ -415,7 +437,8 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
 		if (!net_eq(sock_net(s), net))
 			continue;
 
-		if (s->sk_hash == num && s->sk_family == PF_INET6) {
+		if (udp_sk(s)->udp_port_hash == num &&
+		    s->sk_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(s);
 			if (inet->inet_dport) {
 				if (inet->inet_dport != rmt_port)
-- 
cgit v1.2.3


From 512615b6b843ff3ff5ad583f34c39b3f302f5f26 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 8 Nov 2009 10:17:58 +0000
Subject: udp: secondary hash on (local port, local address)

Extends udp_table to contain a secondary hash table.

socket anchor for this second hash is free, because UDP
doesnt use skc_bind_node : We define an union to hold
both skc_bind_node & a new hlist_nulls_node udp_portaddr_node

udp_lib_get_port() inserts sockets into second hash chain
(additional cost of one atomic op)

udp_lib_unhash() deletes socket from second hash chain
(additional cost of one atomic op)

Note : No spinlock lockdep annotation is needed, because
lock for the secondary hash chain is always get after
lock for primary hash chain.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h |  1 +
 include/net/sock.h  |  8 ++++++--
 include/net/udp.h   | 22 ++++++++++++++++++++--
 net/ipv4/udp.c      | 31 ++++++++++++++++++++++++++-----
 4 files changed, 53 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 5b4b5274e683..59f0ddf2d284 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -57,6 +57,7 @@ struct udp_sock {
 	struct inet_sock inet;
 #define udp_port_hash		inet.sk.__sk_common.skc_u16hashes[0]
 #define udp_portaddr_hash	inet.sk.__sk_common.skc_u16hashes[1]
+#define udp_portaddr_node	inet.sk.__sk_common.skc_portaddr_node
 	int		 pending;	/* Any pending frames ? */
 	unsigned int	 corkflag;	/* Cork is required */
   	__u16		 encap_type;	/* Is this an Encapsulation socket? */
diff --git a/include/net/sock.h b/include/net/sock.h
index 827366b62680..3f1a4804bb3f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -105,7 +105,7 @@ struct net;
 /**
  *	struct sock_common - minimal network layer representation of sockets
  *	@skc_node: main hash linkage for various protocol lookup tables
- *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
+ *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  *	@skc_refcnt: reference count
  *	@skc_tx_queue_mapping: tx queue number for this connection
  *	@skc_hash: hash value used with various protocol lookup tables
@@ -115,6 +115,7 @@ struct net;
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
+ *	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
  *	@skc_prot: protocol handlers inside a network family
  *	@skc_net: reference to the network namespace of this socket
  *
@@ -140,7 +141,10 @@ struct sock_common {
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse;
 	int			skc_bound_dev_if;
-	struct hlist_node	skc_bind_node;
+	union {
+		struct hlist_node	skc_bind_node;
+		struct hlist_nulls_node skc_portaddr_node;
+	};
 	struct proto		*skc_prot;
 #ifdef CONFIG_NET_NS
 	struct net	 	*skc_net;
diff --git a/include/net/udp.h b/include/net/udp.h
index 9167281e47dc..af41850f742a 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -63,10 +63,19 @@ struct udp_hslot {
 	spinlock_t		lock;
 } __attribute__((aligned(2 * sizeof(long))));
 
+/**
+ *	struct udp_table - UDP table
+ *
+ *	@hash:	hash table, sockets are hashed on (local port)
+ *	@hash2:	hash table, sockets are hashed on (local port, local address)
+ *	@mask:	number of slots in hash tables, minus 1
+ *	@log:	log2(number of slots in hash table)
+ */
 struct udp_table {
 	struct udp_hslot	*hash;
-	unsigned int mask;
-	unsigned int log;
+	struct udp_hslot	*hash2;
+	unsigned int		mask;
+	unsigned int		log;
 };
 extern struct udp_table udp_table;
 extern void udp_table_init(struct udp_table *, const char *);
@@ -75,6 +84,15 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
 {
 	return &table->hash[udp_hashfn(net, num, table->mask)];
 }
+/*
+ * For secondary hash, net_hash_mix() is performed before calling
+ * udp_hashslot2(), this explains difference with udp_hashslot()
+ */
+static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
+					      unsigned int hash)
+{
+	return &table->hash2[hash & table->mask];
+}
 
 /* Note: this must match 'valbool' in sock_setsockopt */
 #define UDP_CSUM_NOXMIT		1
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index af72de1c8690..5f04216f35ce 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -163,7 +163,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		       int (*saddr_comp)(const struct sock *sk1,
 					 const struct sock *sk2))
 {
-	struct udp_hslot *hslot;
+	struct udp_hslot *hslot, *hslot2;
 	struct udp_table *udptable = sk->sk_prot->h.udp_table;
 	int    error = 1;
 	struct net *net = sock_net(sk);
@@ -222,6 +222,13 @@ found:
 		sk_nulls_add_node_rcu(sk, &hslot->head);
 		hslot->count++;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+		spin_lock(&hslot2->lock);
+		hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+					 &hslot2->head);
+		hslot2->count++;
+		spin_unlock(&hslot2->lock);
 	}
 	error = 0;
 fail_unlock:
@@ -1062,14 +1069,22 @@ void udp_lib_unhash(struct sock *sk)
 {
 	if (sk_hashed(sk)) {
 		struct udp_table *udptable = sk->sk_prot->h.udp_table;
-		struct udp_hslot *hslot = udp_hashslot(udptable, sock_net(sk),
-						       udp_sk(sk)->udp_port_hash);
+		struct udp_hslot *hslot, *hslot2;
+
+		hslot  = udp_hashslot(udptable, sock_net(sk),
+				      udp_sk(sk)->udp_port_hash);
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
 
 		spin_lock_bh(&hslot->lock);
 		if (sk_nulls_del_node_init_rcu(sk)) {
 			hslot->count--;
 			inet_sk(sk)->inet_num = 0;
 			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+			spin_lock(&hslot2->lock);
+			hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+			hslot2->count--;
+			spin_unlock(&hslot2->lock);
 		}
 		spin_unlock_bh(&hslot->lock);
 	}
@@ -1857,7 +1872,7 @@ void __init udp_table_init(struct udp_table *table, const char *name)
 
 	if (!CONFIG_BASE_SMALL)
 		table->hash = alloc_large_system_hash(name,
-			sizeof(struct udp_hslot),
+			2 * sizeof(struct udp_hslot),
 			uhash_entries,
 			21, /* one slot per 2 MB */
 			0,
@@ -1869,17 +1884,23 @@ void __init udp_table_init(struct udp_table *table, const char *name)
 	 */
 	if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
 		table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
-				      sizeof(struct udp_hslot), GFP_KERNEL);
+				      2 * sizeof(struct udp_hslot), GFP_KERNEL);
 		if (!table->hash)
 			panic(name);
 		table->log = ilog2(UDP_HTABLE_SIZE_MIN);
 		table->mask = UDP_HTABLE_SIZE_MIN - 1;
 	}
+	table->hash2 = table->hash + (table->mask + 1);
 	for (i = 0; i <= table->mask; i++) {
 		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
 		table->hash[i].count = 0;
 		spin_lock_init(&table->hash[i].lock);
 	}
+	for (i = 0; i <= table->mask; i++) {
+		INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+		table->hash2[i].count = 0;
+		spin_lock_init(&table->hash2[i].lock);
+	}
 }
 
 void __init udp_init(void)
-- 
cgit v1.2.3


From 7a50a240c495478179f01c9df4bd75e39cff79c7 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 8 Nov 2009 20:57:03 -0800
Subject: net/compat_ioctl: support SIOCWANDEV

This adds compat_ioctl support for SIOCWANDEV, which has
always been missing.

The definition of struct compat_ifreq was missing an
ifru_settings fields that is needed to support SIOCWANDEV,
so add that and clean up the whitespace damage in the
struct definition.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/compat.h | 41 ++++++++++++++++++++++++-----------------
 net/socket.c           | 23 +++++++++++++++++++++++
 2 files changed, 47 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 224c7a896172..ef68119a4fd2 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -165,25 +165,32 @@ struct compat_ifmap {
 	unsigned char port;
 };
 
+struct compat_if_settings
+{
+	unsigned int type;	/* Type of physical device or protocol */
+	unsigned int size;	/* Size of the data allocated by the caller */
+	compat_uptr_t ifs_ifsu;	/* union of pointers */
+};
+
 struct compat_ifreq {
-        union {
-                char    ifrn_name[IFNAMSIZ];            /* if name, e.g. "en0" */
-        } ifr_ifrn;
-        union {
-                struct  sockaddr ifru_addr;
-                struct  sockaddr ifru_dstaddr;
-                struct  sockaddr ifru_broadaddr;
-                struct  sockaddr ifru_netmask;
-                struct  sockaddr ifru_hwaddr;
-                short   ifru_flags;
-                compat_int_t     ifru_ivalue;
-                compat_int_t     ifru_mtu;
-                struct  compat_ifmap ifru_map;
-                char    ifru_slave[IFNAMSIZ];   /* Just fits the size */
+	union {
+		char	ifrn_name[IFNAMSIZ];    /* if name, e.g. "en0" */
+	} ifr_ifrn;
+	union {
+		struct	sockaddr ifru_addr;
+		struct	sockaddr ifru_dstaddr;
+		struct	sockaddr ifru_broadaddr;
+		struct	sockaddr ifru_netmask;
+		struct	sockaddr ifru_hwaddr;
+		short	ifru_flags;
+		compat_int_t	ifru_ivalue;
+		compat_int_t	ifru_mtu;
+		struct	compat_ifmap ifru_map;
+		char	ifru_slave[IFNAMSIZ];   /* Just fits the size */
 		char	ifru_newname[IFNAMSIZ];
-                compat_caddr_t ifru_data;
-	    /* XXXX? ifru_settings should be here */
-        } ifr_ifru;
+		compat_caddr_t	ifru_data;
+		struct	compat_if_settings ifru_settings;
+	} ifr_ifru;
 };
 
 struct compat_ifconf {
diff --git a/net/socket.c b/net/socket.c
index 224e7f73fdf0..befd9f5b1620 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2627,6 +2627,27 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 	return dev_ioctl(net, SIOCETHTOOL, ifr);
 }
 
+static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
+{
+	void __user *uptr;
+	compat_uptr_t uptr32;
+	struct ifreq __user *uifr;
+
+	uifr = compat_alloc_user_space(sizeof (*uifr));
+	if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq)))
+		return -EFAULT;
+
+	if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu))
+		return -EFAULT;
+
+	uptr = compat_ptr(uptr32);
+
+	if (put_user(uptr, &uifr->ifr_settings.ifs_ifsu.raw_hdlc))
+		return -EFAULT;
+
+	return dev_ioctl(net, SIOCWANDEV, uifr);
+}
+
 static int bond_ioctl(struct net *net, unsigned int cmd,
 			 struct compat_ifreq __user *ifr32)
 {
@@ -3058,6 +3079,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 		return dev_ifconf(net, argp);
 	case SIOCETHTOOL:
 		return ethtool_ioctl(net, argp);
+	case SIOCWANDEV:
+		return compat_siocwandev(net, argp);
 	case SIOCBONDENSLAVE:
 	case SIOCBONDRELEASE:
 	case SIOCBONDSETHWADDR:
-- 
cgit v1.2.3


From dd8dbf2e6880e30c00b18600c962d0cb5a03c555 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Nov 2009 16:35:32 +1100
Subject: security: report the module name to security_module_request

For SELinux to do better filtering in userspace we send the name of the
module along with the AVC denial when a program is denied module_request.

Example output:

type=SYSCALL msg=audit(11/03/2009 10:59:43.510:9) : arch=x86_64 syscall=write success=yes exit=2 a0=3 a1=7fc28c0d56c0 a2=2 a3=7fffca0d7440 items=0 ppid=1727 pid=1729 auid=unset uid=root gid=root euid=root suid=root fsuid=root egid=root sgid=root fsgid=root tty=(none) ses=unset comm=rpc.nfsd exe=/usr/sbin/rpc.nfsd subj=system_u:system_r:nfsd_t:s0 key=(null)
type=AVC msg=audit(11/03/2009 10:59:43.510:9) : avc:  denied  { module_request } for  pid=1729 comm=rpc.nfsd kmod="net-pf-10" scontext=system_u:system_r:nfsd_t:s0 tcontext=system_u:system_r:kernel_t:s0 tclass=system

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/lsm_audit.h | 18 ++++++++++--------
 include/linux/security.h  |  7 ++++---
 kernel/kmod.c             |  8 ++++----
 security/capability.c     |  2 +-
 security/lsm_audit.c      |  4 ++++
 security/security.c       |  4 ++--
 security/selinux/hooks.c  | 13 +++++++++++--
 7 files changed, 36 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 190c37854870..f78f83d7663f 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -26,14 +26,15 @@
 
 /* Auxiliary data to use in generating the audit record. */
 struct common_audit_data {
-	char    type;
-#define LSM_AUDIT_DATA_FS      1
-#define LSM_AUDIT_DATA_NET     2
-#define LSM_AUDIT_DATA_CAP     3
-#define LSM_AUDIT_DATA_IPC     4
-#define LSM_AUDIT_DATA_TASK    5
-#define LSM_AUDIT_DATA_KEY     6
-#define LSM_AUDIT_NO_AUDIT     7
+	char type;
+#define LSM_AUDIT_DATA_FS	1
+#define LSM_AUDIT_DATA_NET	2
+#define LSM_AUDIT_DATA_CAP	3
+#define LSM_AUDIT_DATA_IPC	4
+#define LSM_AUDIT_DATA_TASK	5
+#define LSM_AUDIT_DATA_KEY	6
+#define LSM_AUDIT_NO_AUDIT	7
+#define LSM_AUDIT_DATA_KMOD	8
 	struct task_struct *tsk;
 	union 	{
 		struct {
@@ -66,6 +67,7 @@ struct common_audit_data {
 			char *key_desc;
 		} key_struct;
 #endif
+		char *kmod_name;
 	} u;
 	/* this union contains LSM specific data */
 	union {
diff --git a/include/linux/security.h b/include/linux/security.h
index ed0faea60b82..466cbadbd1ef 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -706,6 +706,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  * @kernel_module_request:
  *	Ability to trigger the kernel to automatically upcall to userspace for
  *	userspace to load a kernel module with the given name.
+ *	@kmod_name name of the module requested by the kernel
  *	Return 0 if successful.
  * @task_setuid:
  *	Check permission before setting one or more of the user identity
@@ -1577,7 +1578,7 @@ struct security_operations {
 	void (*cred_transfer)(struct cred *new, const struct cred *old);
 	int (*kernel_act_as)(struct cred *new, u32 secid);
 	int (*kernel_create_files_as)(struct cred *new, struct inode *inode);
-	int (*kernel_module_request)(void);
+	int (*kernel_module_request)(char *kmod_name);
 	int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags);
 	int (*task_fix_setuid) (struct cred *new, const struct cred *old,
 				int flags);
@@ -1842,7 +1843,7 @@ void security_commit_creds(struct cred *new, const struct cred *old);
 void security_transfer_creds(struct cred *new, const struct cred *old);
 int security_kernel_act_as(struct cred *new, u32 secid);
 int security_kernel_create_files_as(struct cred *new, struct inode *inode);
-int security_kernel_module_request(void);
+int security_kernel_module_request(char *kmod_name);
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags);
 int security_task_fix_setuid(struct cred *new, const struct cred *old,
 			     int flags);
@@ -2407,7 +2408,7 @@ static inline int security_kernel_create_files_as(struct cred *cred,
 	return 0;
 }
 
-static inline int security_kernel_module_request(void)
+static inline int security_kernel_module_request(char *kmod_name)
 {
 	return 0;
 }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9fcb53a11f87..25b103190364 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
 #define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */
 	static int kmod_loop_msg;
 
-	ret = security_kernel_module_request();
-	if (ret)
-		return ret;
-
 	va_start(args, fmt);
 	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
 	va_end(args);
 	if (ret >= MODULE_NAME_LEN)
 		return -ENAMETOOLONG;
 
+	ret = security_kernel_module_request(module_name);
+	if (ret)
+		return ret;
+
 	/* If modprobe needs a service that is in a module, we get a recursive
 	 * loop.  Limit the number of running kmod threads to max_threads/2 or
 	 * MAX_KMOD_CONCURRENT, whichever is the smaller.  A cleaner method
diff --git a/security/capability.c b/security/capability.c
index 4f3ab476937f..5c700e1a4fd3 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -421,7 +421,7 @@ static int cap_kernel_create_files_as(struct cred *new, struct inode *inode)
 	return 0;
 }
 
-static int cap_kernel_module_request(void)
+static int cap_kernel_module_request(char *kmod_name)
 {
 	return 0;
 }
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 3bb90b6f1dd3..51bd0fd9c9f0 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -354,6 +354,10 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 		}
 		break;
 #endif
+	case LSM_AUDIT_DATA_KMOD:
+		audit_log_format(ab, " kmod=");
+		audit_log_untrustedstring(ab, a->u.kmod_name);
+		break;
 	} /* switch (a->type) */
 }
 
diff --git a/security/security.c b/security/security.c
index aad71b2ca195..24e060be9fa5 100644
--- a/security/security.c
+++ b/security/security.c
@@ -764,9 +764,9 @@ int security_kernel_create_files_as(struct cred *new, struct inode *inode)
 	return security_ops->kernel_create_files_as(new, inode);
 }
 
-int security_kernel_module_request(void)
+int security_kernel_module_request(char *kmod_name)
 {
-	return security_ops->kernel_module_request();
+	return security_ops->kernel_module_request(kmod_name);
 }
 
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a29d6612a328..c96d63ec4753 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3337,9 +3337,18 @@ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
 	return 0;
 }
 
-static int selinux_kernel_module_request(void)
+static int selinux_kernel_module_request(char *kmod_name)
 {
-	return task_has_system(current, SYSTEM__MODULE_REQUEST);
+	u32 sid;
+	struct common_audit_data ad;
+
+	sid = task_sid(current);
+
+	COMMON_AUDIT_DATA_INIT(&ad, KMOD);
+	ad.u.kmod_name = kmod_name;
+
+	return avc_has_perm(sid, SECINITSID_KERNEL, SECCLASS_SYSTEM,
+			    SYSTEM__MODULE_REQUEST, &ad);
 }
 
 static int selinux_task_setpgid(struct task_struct *p, pid_t pgid)
-- 
cgit v1.2.3


From 86b37281411cf1e9bc0a6b5406c45edb7bd9ea5d Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 10 Nov 2009 11:50:21 +0100
Subject: block: Expose discard granularity

While SSDs track block usage on a per-sector basis, RAID arrays often
have allocation blocks that are bigger.  Allow the discard granularity
and alignment to be set and teach the topology stacking logic how to
handle them.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c   | 46 ++++++++++++++++++++++++++++++++++++----------
 block/blk-sysfs.c      | 22 ++++++++++++++++++++++
 block/genhd.c          | 12 ++++++++++++
 fs/partitions/check.c  | 12 ++++++++++++
 include/linux/blkdev.h | 18 ++++++++++++++++++
 include/linux/genhd.h  |  1 +
 6 files changed, 101 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 66d4aa8799b7..7f986cafacd5 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,7 +96,10 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_segment_size = MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
 	lim->max_hw_sectors = INT_MAX;
-	lim->max_discard_sectors = SAFE_MAX_SECTORS;
+	lim->max_discard_sectors = 0;
+	lim->discard_granularity = 0;
+	lim->discard_alignment = 0;
+	lim->discard_misaligned = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -488,6 +491,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 }
 EXPORT_SYMBOL(blk_queue_stack_limits);
 
+static unsigned int lcm(unsigned int a, unsigned int b)
+{
+	if (a && b)
+		return (a * b) / gcd(a, b);
+	else if (b)
+		return b;
+
+	return a;
+}
+
 /**
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t:	the stacking driver limits (top)
@@ -502,6 +515,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		     sector_t offset)
 {
+	int ret;
+
+	ret = 0;
+
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
 	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@@ -531,7 +548,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	if (offset &&
 	    (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
 		t->misaligned = 1;
-		return -1;
+		ret = -1;
+	}
+
+	if (offset &&
+	    (offset & (b->discard_granularity - 1)) != b->discard_alignment) {
+		t->discard_misaligned = 1;
+		ret = -1;
 	}
 
 	/* If top has no alignment offset, inherit from bottom */
@@ -539,23 +562,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		t->alignment_offset =
 			b->alignment_offset & (b->physical_block_size - 1);
 
+	if (!t->discard_alignment)
+		t->discard_alignment =
+			b->discard_alignment & (b->discard_granularity - 1);
+
 	/* Top device aligned on logical block boundary? */
 	if (t->alignment_offset & (t->logical_block_size - 1)) {
 		t->misaligned = 1;
-		return -1;
+		ret = -1;
 	}
 
-	/* Find lcm() of optimal I/O size */
-	if (t->io_opt && b->io_opt)
-		t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt);
-	else if (b->io_opt)
-		t->io_opt = b->io_opt;
+	/* Find lcm() of optimal I/O size and granularity */
+	t->io_opt = lcm(t->io_opt, b->io_opt);
+	t->discard_granularity = lcm(t->discard_granularity,
+				     b->discard_granularity);
 
 	/* Verify that optimal I/O size is a multiple of io_min */
 	if (t->io_min && t->io_opt % t->io_min)
-		return -1;
+		ret = -1;
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(blk_stack_limits);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8a6d81afb284..3147145edc15 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -126,6 +126,16 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
 	return queue_var_show(queue_io_opt(q), page);
 }
 
+static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.discard_granularity, page);
+}
+
+static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.max_discard_sectors << 9, page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -293,6 +303,16 @@ static struct queue_sysfs_entry queue_io_opt_entry = {
 	.show = queue_io_opt_show,
 };
 
+static struct queue_sysfs_entry queue_discard_granularity_entry = {
+	.attr = {.name = "discard_granularity", .mode = S_IRUGO },
+	.show = queue_discard_granularity_show,
+};
+
+static struct queue_sysfs_entry queue_discard_max_entry = {
+	.attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
+	.show = queue_discard_max_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
@@ -328,6 +348,8 @@ static struct attribute *default_attrs[] = {
 	&queue_physical_block_size_entry.attr,
 	&queue_io_min_entry.attr,
 	&queue_io_opt_entry.attr,
+	&queue_discard_granularity_entry.attr,
+	&queue_discard_max_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/genhd.c b/block/genhd.c
index 517e4332cb37..b11a4ad7d571 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
 	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
 }
 
+static ssize_t disk_discard_alignment_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue));
+}
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
+		   NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
@@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
+	&dev_attr_discard_alignment.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e10cbad..64bc8998ac9a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
 	return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
 }
 
+ssize_t part_discard_alignment_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%u\n", p->discard_alignment);
+}
+
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
@@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
+		   NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
+	&dev_attr_discard_alignment.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	p->start_sect = start;
 	p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
+	p->discard_alignment = queue_sector_discard_alignment(disk->queue,
+							      start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 39c601f783a0..1cc02972fbe2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -312,12 +312,15 @@ struct queue_limits {
 	unsigned int		io_min;
 	unsigned int		io_opt;
 	unsigned int		max_discard_sectors;
+	unsigned int		discard_granularity;
+	unsigned int		discard_alignment;
 
 	unsigned short		logical_block_size;
 	unsigned short		max_hw_segments;
 	unsigned short		max_phys_segments;
 
 	unsigned char		misaligned;
+	unsigned char		discard_misaligned;
 	unsigned char		no_cluster;
 };
 
@@ -1121,6 +1124,21 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
 	return q->limits.alignment_offset;
 }
 
+static inline int queue_discard_alignment(struct request_queue *q)
+{
+	if (q->limits.discard_misaligned)
+		return -1;
+
+	return q->limits.discard_alignment;
+}
+
+static inline int queue_sector_discard_alignment(struct request_queue *q,
+						 sector_t sector)
+{
+	return ((sector << 9) - q->limits.discard_alignment)
+		& (q->limits.discard_granularity - 1);
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 297df45ffd0a..c6c0c41af35f 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -91,6 +91,7 @@ struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
 	sector_t alignment_offset;
+	unsigned int discard_alignment;
 	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;
-- 
cgit v1.2.3


From 9d5ce73a64be2be8112147a3e0b551ad9cd1247b Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 10 Nov 2009 19:46:16 +0900
Subject: x86: intel-iommu: Convert detect_intel_iommu to use iommu_init hook

This changes detect_intel_iommu() to set intel_iommu_init() to
iommu_init hook if detect_intel_iommu() finds the IOMMU.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: chrisw@sous-sol.org
Cc: dwmw2@infradead.org
Cc: joerg.roedel@amd.com
Cc: muli@il.ibm.com
LKML-Reference: <1257849980-22640-6-git-send-email-fujita.tomonori@lab.ntt.co.jp>
[ -v2: build fix for the !CONFIG_DMAR case ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-dma.c |  2 --
 drivers/pci/dmar.c        |  4 ++++
 include/linux/dmar.h      | 15 ++++-----------
 3 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 5ca44a9301a0..bed05e2e5890 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -294,8 +294,6 @@ static int __init pci_iommu_init(void)
 
 	x86_init.iommu.iommu_init();
 
-	intel_iommu_init();
-
 	no_iommu_init();
 	return 0;
 }
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 22b02c6df854..bce9cd7c755a 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -616,6 +616,10 @@ void __init detect_intel_iommu(void)
 		if (ret && !no_iommu && !iommu_detected && !swiotlb &&
 		    !dmar_disabled)
 			iommu_detected = 1;
+#endif
+#ifdef CONFIG_X86
+		if (ret)
+			x86_init.iommu.iommu_init = intel_iommu_init;
 #endif
 	}
 	early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 4a2b162c256a..5de4c9e5856d 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -208,16 +208,9 @@ struct dmar_atsr_unit {
 	u8 include_all:1;		/* include all ports */
 };
 
-/* Intel DMAR  initialization functions */
 extern int intel_iommu_init(void);
-#else
-static inline int intel_iommu_init(void)
-{
-#ifdef CONFIG_INTR_REMAP
-	return dmar_dev_scope_init();
-#else
-	return -ENODEV;
-#endif
-}
-#endif /* !CONFIG_DMAR */
+#else /* !CONFIG_DMAR: */
+static inline int intel_iommu_init(void) { return -ENODEV; }
+#endif /* CONFIG_DMAR */
+
 #endif /* __DMAR_H__ */
-- 
cgit v1.2.3


From 9f993ac3f708b661207ed7de521f245586217a68 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 10 Nov 2009 19:46:17 +0900
Subject: bootmem: Add free_bootmem_late()

Add a new function for freeing bootmem after the bootmem
allocator has been released and the unreserved pages given to
the page allocator.

This allows us to reserve bootmem and then release it if we
later discover it was not needed.

( This new API will be used by the swiotlb code to recover
  a significant amount of RAM (64MB). )

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: chrisw@sous-sol.org
Cc: dwmw2@infradead.org
Cc: joerg.roedel@amd.com
Cc: muli@il.ibm.com
Cc: hannes@cmpxchg.org
Cc: tj@kernel.org
Cc: akpm@linux-foundation.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1257849980-22640-7-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/bootmem.h |  1 +
 mm/bootmem.c            | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index dd97fb8408a8..b10ec49ee2dd 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
 			      unsigned long addr,
 			      unsigned long size);
 extern void free_bootmem(unsigned long addr, unsigned long size);
+extern void free_bootmem_late(unsigned long addr, unsigned long size);
 
 /*
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2731c6..d1dc23cc7f10 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 	return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
 }
 
+/*
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system.  Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+	unsigned long cursor, end;
+
+	kmemleak_free_part(__va(addr), size);
+
+	cursor = PFN_UP(addr);
+	end = PFN_DOWN(addr + size);
+
+	for (; cursor < end; cursor++) {
+		__free_pages_bootmem(pfn_to_page(cursor), 0);
+		totalram_pages++;
+	}
+}
+
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
 	int aligned;
-- 
cgit v1.2.3


From 5740afdb68abadc473fd5392df733558a58c1254 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 10 Nov 2009 19:46:18 +0900
Subject: swiotlb: Add swiotlb_free() function

swiotlb_free() function frees all allocated memory for swiotlb.

We need to initialize swiotlb before IOMMU initialization (x86
and powerpc needs to allocate memory from bootmem allocator). If
IOMMU initialization is successful, we need to free swiotlb
resource (don't want to waste 64MB).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: chrisw@sous-sol.org
Cc: dwmw2@infradead.org
Cc: joerg.roedel@amd.com
Cc: muli@il.ibm.com
LKML-Reference: <1257849980-22640-8-git-send-email-fujita.tomonori@lab.ntt.co.jp>
[ -v2: build fix for the !CONFIG_SWIOTLB case ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/swiotlb.h |  6 ++++++
 lib/swiotlb.c           | 30 ++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 73b1f1cec423..59bafa690290 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -88,4 +88,10 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
 extern int
 swiotlb_dma_supported(struct device *hwdev, u64 mask);
 
+#ifdef CONFIG_SWIOTLB
+extern void __init swiotlb_free(void);
+#else
+static inline void swiotlb_free(void) { }
+#endif
+
 #endif /* __LINUX_SWIOTLB_H */
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index ac25cd28e807..eee512b63f17 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -97,6 +97,8 @@ static phys_addr_t *io_tlb_orig_addr;
  */
 static DEFINE_SPINLOCK(io_tlb_lock);
 
+static int late_alloc;
+
 static int __init
 setup_io_tlb_npages(char *str)
 {
@@ -262,6 +264,8 @@ swiotlb_late_init_with_default_size(size_t default_size)
 
 	swiotlb_print_info(bytes);
 
+	late_alloc = 1;
+
 	return 0;
 
 cleanup4:
@@ -281,6 +285,32 @@ cleanup1:
 	return -ENOMEM;
 }
 
+void __init swiotlb_free(void)
+{
+	if (!io_tlb_overflow_buffer)
+		return;
+
+	if (late_alloc) {
+		free_pages((unsigned long)io_tlb_overflow_buffer,
+			   get_order(io_tlb_overflow));
+		free_pages((unsigned long)io_tlb_orig_addr,
+			   get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
+		free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+								 sizeof(int)));
+		free_pages((unsigned long)io_tlb_start,
+			   get_order(io_tlb_nslabs << IO_TLB_SHIFT));
+	} else {
+		free_bootmem_late(__pa(io_tlb_overflow_buffer),
+				  io_tlb_overflow);
+		free_bootmem_late(__pa(io_tlb_orig_addr),
+				  io_tlb_nslabs * sizeof(phys_addr_t));
+		free_bootmem_late(__pa(io_tlb_list),
+				  io_tlb_nslabs * sizeof(int));
+		free_bootmem_late(__pa(io_tlb_start),
+				  io_tlb_nslabs << IO_TLB_SHIFT);
+	}
+}
+
 static int is_swiotlb_buffer(phys_addr_t paddr)
 {
 	return paddr >= virt_to_phys(io_tlb_start) &&
-- 
cgit v1.2.3


From ad32e8cb86e7894aac51c8963eaa9f36bb8a4e14 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 10 Nov 2009 19:46:19 +0900
Subject: swiotlb: Defer swiotlb init printing, export swiotlb_print_info()

This enables us to avoid printing swiotlb memory info when we
initialize swiotlb. After swiotlb initialization, we could find
that we don't need swiotlb.

This patch removes the code to print swiotlb memory info in
swiotlb_init() and exports the function to do that.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: chrisw@sous-sol.org
Cc: dwmw2@infradead.org
Cc: joerg.roedel@amd.com
Cc: muli@il.ibm.com
Cc: tony.luck@intel.com
Cc: benh@kernel.crashing.org
LKML-Reference: <1257849980-22640-9-git-send-email-fujita.tomonori@lab.ntt.co.jp>
[ -v2: merge up conflict ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/kernel/pci-swiotlb.c |  4 ++--
 arch/powerpc/kernel/setup_32.c |  2 +-
 arch/powerpc/kernel/setup_64.c |  2 +-
 arch/x86/kernel/pci-swiotlb.c  |  3 +--
 include/linux/swiotlb.h        |  4 ++--
 lib/swiotlb.c                  | 15 ++++++++-------
 6 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
index 285aae8431c6..53292abf846c 100644
--- a/arch/ia64/kernel/pci-swiotlb.c
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = {
 void __init swiotlb_dma_init(void)
 {
 	dma_ops = &swiotlb_dma_ops;
-	swiotlb_init();
+	swiotlb_init(1);
 }
 
 void __init pci_swiotlb_init(void)
@@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void)
 		swiotlb = 1;
 		printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
 		machvec_init("dig");
-		swiotlb_init();
+		swiotlb_init(1);
 		dma_ops = &swiotlb_dma_ops;
 #else
 		panic("Unable to find Intel IOMMU");
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 53bcf3d792db..b152de3e64d4 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_SWIOTLB
 	if (ppc_swiotlb_enable)
-		swiotlb_init();
+		swiotlb_init(1);
 #endif
 
 	paging_init();
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 04f638d82fb3..df2c9e932b37 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -550,7 +550,7 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_SWIOTLB
 	if (ppc_swiotlb_enable)
-		swiotlb_init();
+		swiotlb_init(1);
 #endif
 
 	paging_init();
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..ea20ef7ca523 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -52,8 +52,7 @@ void __init pci_swiotlb_init(void)
 	if (swiotlb_force)
 		swiotlb = 1;
 	if (swiotlb) {
-		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
-		swiotlb_init();
+		swiotlb_init(0);
 		dma_ops = &swiotlb_dma_ops;
 	}
 }
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 59bafa690290..eb9bdb4d4854 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -20,8 +20,7 @@ struct scatterlist;
  */
 #define IO_TLB_SHIFT 11
 
-extern void
-swiotlb_init(void);
+extern void swiotlb_init(int verbose);
 
 extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -94,4 +93,5 @@ extern void __init swiotlb_free(void);
 static inline void swiotlb_free(void) { }
 #endif
 
+extern void swiotlb_print_info(void);
 #endif /* __LINUX_SWIOTLB_H */
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index eee512b63f17..0c12d7cce300 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -123,8 +123,9 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
 	return phys_to_dma(hwdev, virt_to_phys(address));
 }
 
-static void swiotlb_print_info(unsigned long bytes)
+void swiotlb_print_info(void)
 {
+	unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
 	phys_addr_t pstart, pend;
 
 	pstart = virt_to_phys(io_tlb_start);
@@ -142,7 +143,7 @@ static void swiotlb_print_info(unsigned long bytes)
  * structures for the software IO TLB used to implement the DMA API.
  */
 void __init
-swiotlb_init_with_default_size(size_t default_size)
+swiotlb_init_with_default_size(size_t default_size, int verbose)
 {
 	unsigned long i, bytes;
 
@@ -178,14 +179,14 @@ swiotlb_init_with_default_size(size_t default_size)
 	io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
 	if (!io_tlb_overflow_buffer)
 		panic("Cannot allocate SWIOTLB overflow buffer!\n");
-
-	swiotlb_print_info(bytes);
+	if (verbose)
+		swiotlb_print_info();
 }
 
 void __init
-swiotlb_init(void)
+swiotlb_init(int verbose)
 {
-	swiotlb_init_with_default_size(64 * (1<<20));	/* default to 64MB */
+	swiotlb_init_with_default_size(64 * (1<<20), verbose);	/* default to 64MB */
 }
 
 /*
@@ -262,7 +263,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
 	if (!io_tlb_overflow_buffer)
 		goto cleanup4;
 
-	swiotlb_print_info(bytes);
+	swiotlb_print_info();
 
 	late_alloc = 1;
 
-- 
cgit v1.2.3


From cfd5324e699a2e74a44642d43dcf03d581f2a7db Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Date: Wed, 4 Nov 2009 09:58:17 +0200
Subject: MFD: TWL4030: Add audio_mclk to the codec platform data

Add audio_mclk to the platform data struct for the
twl4030-codec MFD driver.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Acked-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 include/linux/i2c/twl4030.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h
index 42d6c722bd82..c188961efbce 100644
--- a/include/linux/i2c/twl4030.h
+++ b/include/linux/i2c/twl4030.h
@@ -414,6 +414,7 @@ struct twl4030_codec_vibra_data {
 };
 
 struct twl4030_codec_data {
+	unsigned int	audio_mclk;
 	struct twl4030_codec_audio_data		*audio;
 	struct twl4030_codec_vibra_data		*vibra;
 };
-- 
cgit v1.2.3


From f9b4639e045c750e2bad37462476403995508350 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Date: Wed, 4 Nov 2009 09:58:19 +0200
Subject: MFD: twl4030-codec: APLL_INFREQ handling in the MFD driver

Configure the APLL_INFREQ field in the APLL_CTL register
based on the platform data.
Provide also a function for childs to query the audio_mclk
frequency.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Acked-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 drivers/mfd/twl4030-codec.c       | 35 +++++++++++++++++++++++++++++++++++
 include/linux/mfd/twl4030-codec.h |  1 +
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/twl4030-codec.c b/drivers/mfd/twl4030-codec.c
index 97103078dc2a..77b914907d7c 100644
--- a/drivers/mfd/twl4030-codec.c
+++ b/drivers/mfd/twl4030-codec.c
@@ -41,6 +41,7 @@ struct twl4030_codec_resource {
 };
 
 struct twl4030_codec {
+	unsigned int audio_mclk;
 	struct mutex mutex;
 	struct twl4030_codec_resource resource[TWL4030_CODEC_RES_MAX];
 	struct mfd_cell cells[TWL4030_CODEC_CELLS];
@@ -145,12 +146,45 @@ int twl4030_codec_disable_resource(unsigned id)
 }
 EXPORT_SYMBOL_GPL(twl4030_codec_disable_resource);
 
+unsigned int twl4030_codec_get_mclk(void)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+
+	return codec->audio_mclk;
+}
+EXPORT_SYMBOL_GPL(twl4030_codec_get_mclk);
+
 static int __devinit twl4030_codec_probe(struct platform_device *pdev)
 {
 	struct twl4030_codec *codec;
 	struct twl4030_codec_data *pdata = pdev->dev.platform_data;
 	struct mfd_cell *cell = NULL;
 	int ret, childs = 0;
+	u8 val;
+
+	if (!pdata) {
+		dev_err(&pdev->dev, "Platform data is missing\n");
+		return -EINVAL;
+	}
+
+	/* Configure APLL_INFREQ and disable APLL if enabled */
+	val = 0;
+	switch (pdata->audio_mclk) {
+	case 19200000:
+		val |= TWL4030_APLL_INFREQ_19200KHZ;
+		break;
+	case 26000000:
+		val |= TWL4030_APLL_INFREQ_26000KHZ;
+		break;
+	case 38400000:
+		val |= TWL4030_APLL_INFREQ_38400KHZ;
+		break;
+	default:
+		dev_err(&pdev->dev, "Invalid audio_mclk\n");
+		return -EINVAL;
+	}
+	twl4030_i2c_write_u8(TWL4030_MODULE_AUDIO_VOICE,
+					val, TWL4030_REG_APLL_CTL);
 
 	codec = kzalloc(sizeof(struct twl4030_codec), GFP_KERNEL);
 	if (!codec)
@@ -160,6 +194,7 @@ static int __devinit twl4030_codec_probe(struct platform_device *pdev)
 
 	twl4030_codec_dev = pdev;
 	mutex_init(&codec->mutex);
+	codec->audio_mclk = pdata->audio_mclk;
 
 	/* Codec power */
 	codec->resource[TWL4030_CODEC_RES_POWER].reg = TWL4030_REG_CODEC_MODE;
diff --git a/include/linux/mfd/twl4030-codec.h b/include/linux/mfd/twl4030-codec.h
index ef0a3041d759..2ec317c68e59 100644
--- a/include/linux/mfd/twl4030-codec.h
+++ b/include/linux/mfd/twl4030-codec.h
@@ -267,5 +267,6 @@ enum twl4030_codec_res {
 
 int twl4030_codec_disable_resource(enum twl4030_codec_res id);
 int twl4030_codec_enable_resource(enum twl4030_codec_res id);
+unsigned int twl4030_codec_get_mclk(void);
 
 #endif	/* End of __TWL4030_CODEC_H__ */
-- 
cgit v1.2.3


From 30fff9231fad757c061285e347b33c5149c2c2e4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 9 Nov 2009 05:26:33 +0000
Subject: udp: bind() optimisation

UDP bind() can be O(N^2) in some pathological cases.

Thanks to secondary hash tables, we can make it O(N)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h |  6 +++++
 include/net/udp.h   |  3 ++-
 net/ipv4/udp.c      | 73 +++++++++++++++++++++++++++++++++++++++++++++++------
 net/ipv6/udp.c      | 14 +++++-----
 4 files changed, 80 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 59f0ddf2d284..03f72a2ba028 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -88,6 +88,12 @@ static inline struct udp_sock *udp_sk(const struct sock *sk)
 	return (struct udp_sock *)sk;
 }
 
+#define udp_portaddr_for_each_entry(__sk, node, list) \
+	hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node)
+
+#define udp_portaddr_for_each_entry_rcu(__sk, node, list) \
+	hlist_nulls_for_each_entry_rcu(__sk, node, list, __sk_common.skc_portaddr_node)
+
 #define IS_UDPLITE(__sk) (udp_sk(__sk)->pcflag)
 
 #endif
diff --git a/include/net/udp.h b/include/net/udp.h
index af41850f742a..5348d80b25bb 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -158,7 +158,8 @@ static inline void udp_lib_close(struct sock *sk, long timeout)
 }
 
 extern int	udp_lib_get_port(struct sock *sk, unsigned short snum,
-		int (*)(const struct sock*,const struct sock*));
+		int (*)(const struct sock *,const struct sock *),
+		unsigned int hash2_nulladdr);
 
 /* net/ipv4/udp.c */
 extern int	udp_get_port(struct sock *sk, unsigned short snum,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d73e9170536b..1eaf57567ebf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -152,16 +152,49 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 	return 0;
 }
 
+/*
+ * Note: we still hold spinlock of primary hash chain, so no other writer
+ * can insert/delete a socket with local_port == num
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+			       struct udp_hslot *hslot2,
+			       struct sock *sk,
+			       int (*saddr_comp)(const struct sock *sk1,
+						 const struct sock *sk2))
+{
+	struct sock *sk2;
+	struct hlist_nulls_node *node;
+	int res = 0;
+
+	spin_lock(&hslot2->lock);
+	udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
+		if (net_eq(sock_net(sk2), net)			&&
+		    sk2 != sk					&&
+		    (udp_sk(sk2)->udp_port_hash == num)		&&
+		    (!sk2->sk_reuse || !sk->sk_reuse)		&&
+		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
+			|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+		    (*saddr_comp)(sk, sk2)) {
+			res = 1;
+			break;
+		}
+	spin_unlock(&hslot2->lock);
+	return res;
+}
+
 /**
  *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
  *
  *  @sk:          socket struct in question
  *  @snum:        port number to look up
  *  @saddr_comp:  AF-dependent comparison of bound local IP addresses
+ *  @hash2_nulladdr: AF-dependant hash value in secondary hash chains,
+ *                   with NULL address
  */
 int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		       int (*saddr_comp)(const struct sock *sk1,
-					 const struct sock *sk2))
+					 const struct sock *sk2),
+		     unsigned int hash2_nulladdr)
 {
 	struct udp_hslot *hslot, *hslot2;
 	struct udp_table *udptable = sk->sk_prot->h.udp_table;
@@ -210,6 +243,30 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	} else {
 		hslot = udp_hashslot(udptable, net, snum);
 		spin_lock_bh(&hslot->lock);
+		if (hslot->count > 10) {
+			int exist;
+			unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+			slot2          &= udptable->mask;
+			hash2_nulladdr &= udptable->mask;
+
+			hslot2 = udp_hashslot2(udptable, slot2);
+			if (hslot->count < hslot2->count)
+				goto scan_primary_hash;
+
+			exist = udp_lib_lport_inuse2(net, snum, hslot2,
+						     sk, saddr_comp);
+			if (!exist && (hash2_nulladdr != slot2)) {
+				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+				exist = udp_lib_lport_inuse2(net, snum, hslot2,
+							     sk, saddr_comp);
+			}
+			if (exist)
+				goto fail_unlock;
+			else
+				goto found;
+		}
+scan_primary_hash:
 		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
 					saddr_comp, 0))
 			goto fail_unlock;
@@ -255,12 +312,14 @@ static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
 
 int udp_v4_get_port(struct sock *sk, unsigned short snum)
 {
+	unsigned int hash2_nulladdr =
+		udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum);
+	unsigned int hash2_partial =
+		udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+
 	/* precompute partial secondary hash */
-	udp_sk(sk)->udp_portaddr_hash =
-		udp4_portaddr_hash(sock_net(sk),
-				   inet_sk(sk)->inet_rcv_saddr,
-				   0);
-	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
+	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
 }
 
 static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
@@ -336,8 +395,6 @@ static inline int compute_score2(struct sock *sk, struct net *net,
 	return score;
 }
 
-#define udp_portaddr_for_each_entry_rcu(__sk, node, list) \
-	hlist_nulls_for_each_entry_rcu(__sk, node, list, __sk_common.skc_portaddr_node)
 
 /* called with read_rcu_lock() */
 static struct sock *udp4_lib_lookup2(struct net *net,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2915e1dad726..f4c85b200051 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -100,12 +100,14 @@ static unsigned int udp6_portaddr_hash(struct net *net,
 
 int udp_v6_get_port(struct sock *sk, unsigned short snum)
 {
+	unsigned int hash2_nulladdr =
+		udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum);
+	unsigned int hash2_partial = 
+		udp6_portaddr_hash(sock_net(sk), &inet6_sk(sk)->rcv_saddr, 0);
+
 	/* precompute partial secondary hash */
-	udp_sk(sk)->udp_portaddr_hash =
-		udp6_portaddr_hash(sock_net(sk),
-				   &inet6_sk(sk)->rcv_saddr,
-				   0);
-	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal);
+	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
 }
 
 static inline int compute_score(struct sock *sk, struct net *net,
@@ -181,8 +183,6 @@ static inline int compute_score2(struct sock *sk, struct net *net,
 	return score;
 }
 
-#define udp_portaddr_for_each_entry_rcu(__sk, node, list) \
-	hlist_nulls_for_each_entry_rcu(__sk, node, list, __sk_common.skc_portaddr_node)
 
 /* called with read_rcu_lock() */
 static struct sock *udp6_lib_lookup2(struct net *net,
-- 
cgit v1.2.3


From 37e8273cd30592d3a82bcb70cbb1bdc4eaeb6b71 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Wed, 4 Nov 2009 15:29:52 +0000
Subject: usbnet: Set link down initially for drivers that update link state

Some usbnet drivers update link state while others do not due to
hardware limitations.  Add a flag to distinguish those that do, and
set the link down initially for their devices.

This is intended to fix this bug: http://bugs.debian.org/444043

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/asix.c      | 12 ++++++------
 drivers/net/usb/cdc_ether.c |  2 +-
 drivers/net/usb/dm9601.c    |  2 +-
 drivers/net/usb/usbnet.c    |  4 +++-
 include/linux/usb/usbnet.h  |  1 +
 5 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/asix.c b/drivers/net/usb/asix.c
index 6ce7f775bb74..1bef39a60a62 100644
--- a/drivers/net/usb/asix.c
+++ b/drivers/net/usb/asix.c
@@ -1327,7 +1327,7 @@ static const struct driver_info ax8817x_info = {
 	.status = asix_status,
 	.link_reset = ax88172_link_reset,
 	.reset = ax88172_link_reset,
-	.flags =  FLAG_ETHER,
+	.flags =  FLAG_ETHER | FLAG_LINK_INTR,
 	.data = 0x00130103,
 };
 
@@ -1337,7 +1337,7 @@ static const struct driver_info dlink_dub_e100_info = {
 	.status = asix_status,
 	.link_reset = ax88172_link_reset,
 	.reset = ax88172_link_reset,
-	.flags =  FLAG_ETHER,
+	.flags =  FLAG_ETHER | FLAG_LINK_INTR,
 	.data = 0x009f9d9f,
 };
 
@@ -1347,7 +1347,7 @@ static const struct driver_info netgear_fa120_info = {
 	.status = asix_status,
 	.link_reset = ax88172_link_reset,
 	.reset = ax88172_link_reset,
-	.flags =  FLAG_ETHER,
+	.flags =  FLAG_ETHER | FLAG_LINK_INTR,
 	.data = 0x00130103,
 };
 
@@ -1357,7 +1357,7 @@ static const struct driver_info hawking_uf200_info = {
 	.status = asix_status,
 	.link_reset = ax88172_link_reset,
 	.reset = ax88172_link_reset,
-	.flags =  FLAG_ETHER,
+	.flags =  FLAG_ETHER | FLAG_LINK_INTR,
 	.data = 0x001f1d1f,
 };
 
@@ -1367,7 +1367,7 @@ static const struct driver_info ax88772_info = {
 	.status = asix_status,
 	.link_reset = ax88772_link_reset,
 	.reset = ax88772_link_reset,
-	.flags = FLAG_ETHER | FLAG_FRAMING_AX,
+	.flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR,
 	.rx_fixup = asix_rx_fixup,
 	.tx_fixup = asix_tx_fixup,
 };
@@ -1378,7 +1378,7 @@ static const struct driver_info ax88178_info = {
 	.status = asix_status,
 	.link_reset = ax88178_link_reset,
 	.reset = ax88178_link_reset,
-	.flags = FLAG_ETHER | FLAG_FRAMING_AX,
+	.flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR,
 	.rx_fixup = asix_rx_fixup,
 	.tx_fixup = asix_tx_fixup,
 };
diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 71d7ff3de99f..7ec24c9b2535 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -413,7 +413,7 @@ static int cdc_bind(struct usbnet *dev, struct usb_interface *intf)
 
 static const struct driver_info	cdc_info = {
 	.description =	"CDC Ethernet Device",
-	.flags =	FLAG_ETHER,
+	.flags =	FLAG_ETHER | FLAG_LINK_INTR,
 	// .check_connect = cdc_check_connect,
 	.bind =		cdc_bind,
 	.unbind =	usbnet_cdc_unbind,
diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c
index a2b30a10064f..3d406f9b2f29 100644
--- a/drivers/net/usb/dm9601.c
+++ b/drivers/net/usb/dm9601.c
@@ -611,7 +611,7 @@ static int dm9601_link_reset(struct usbnet *dev)
 
 static const struct driver_info dm9601_info = {
 	.description	= "Davicom DM9601 USB Ethernet",
-	.flags		= FLAG_ETHER,
+	.flags		= FLAG_ETHER | FLAG_LINK_INTR,
 	.bind		= dm9601_bind,
 	.rx_fixup	= dm9601_rx_fixup,
 	.tx_fixup	= dm9601_tx_fixup,
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 378da8c938fe..04f3f289e87c 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1352,9 +1352,11 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 	// ok, it's ready to go.
 	usb_set_intfdata (udev, dev);
 
-	// start as if the link is up
 	netif_device_attach (net);
 
+	if (dev->driver_info->flags & FLAG_LINK_INTR)
+		netif_carrier_off(net);
+
 	return 0;
 
 out3:
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 86c31b753266..8c84881f8478 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -92,6 +92,7 @@ struct driver_info {
 #define FLAG_SEND_ZLP	0x0200		/* hw requires ZLPs are sent */
 #define FLAG_WWAN	0x0400		/* use "wwan%d" names */
 
+#define FLAG_LINK_INTR	0x0800		/* updates link (carrier) status */
 
 	/* init device ... can sleep, or cause probe() failure */
 	int	(*bind)(struct usbnet *, struct usb_interface *);
-- 
cgit v1.2.3


From 254245d23396aca1f9100d500163d7bd6019ab6f Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Tue, 10 Nov 2009 07:54:47 +0000
Subject: netdev: add netdev_continue_rcu

This adds an RCU macro for continuing search, useful for some
network devices like vlan.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 ++
 include/linux/rculist.h   | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 465add6c43e3..083b5989cecb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1079,6 +1079,8 @@ extern rwlock_t				dev_base_lock;		/* Device list lock */
 		list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_continue(net, d)		\
 		list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list)
+#define for_each_netdev_continue_rcu(net, d)		\
+	list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
 #define net_device_entry(lh)	list_entry(lh, struct net_device, dev_list)
 
 static inline struct net_device *next_net_device(struct net_device *dev)
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 5710f43bbc9e..1bf0f708c4fc 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -261,6 +261,20 @@ static inline void list_splice_init_rcu(struct list_head *list,
 		prefetch((pos)->next), (pos) != (head); \
 		(pos) = rcu_dereference((pos)->next))
 
+/**
+ * list_for_each_entry_continue_rcu - continue iteration over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue_rcu(pos, head, member) 		\
+	for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
+	     prefetch(pos->member.next), &pos->member != (head);	\
+	     pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
+
 /**
  * hlist_del_rcu - deletes entry from hash list without re-initialization
  * @n: the element to delete from the hash list.
-- 
cgit v1.2.3


From 2315ffa0a9f789c588c7139effa7404a387d8685 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 03:18:02 -0700
Subject: sysctl: Don't look at ctl_name and strategy in the generic code

The ctl_name and strategy fields are unused, now that sys_sysctl
is a compatibility wrapper around /proc/sys.  No longer looking
at them in the generic code is effectively what we are doing
now and provides the guarantee that during further cleanups
we can just remove references to those fields and everything
will work ok.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/proc_sysctl.c  |  4 ++--
 include/linux/sysctl.h | 17 ++---------------
 kernel/sysctl.c        | 29 +++++++----------------------
 3 files changed, 11 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f667e8aeabdf..6ff9981f0a18 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -48,7 +48,7 @@ out:
 static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
 {
 	int len;
-	for ( ; p->ctl_name || p->procname; p++) {
+	for ( ; p->procname; p++) {
 
 		if (!p->procname)
 			continue;
@@ -218,7 +218,7 @@ static int scan(struct ctl_table_header *head, ctl_table *table,
 		void *dirent, filldir_t filldir)
 {
 
-	for (; table->ctl_name || table->procname; table++, (*pos)++) {
+	for (; table->procname; table++, (*pos)++) {
 		int res;
 
 		/* Can't do anything without a proc name */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 82c32b89932d..7c4aabc04673 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1005,8 +1005,8 @@ extern ctl_handler sysctl_ms_jiffies;
 
 /*
  * Register a set of sysctl names by calling register_sysctl_table
- * with an initialised array of struct ctl_table's.  An entry with zero
- * ctl_name and NULL procname terminates the table.  table->de will be
+ * with an initialised array of struct ctl_table's.  An entry with 
+ * NULL procname terminates the table.  table->de will be
  * set up by the registration and need not be initialised in advance.
  *
  * sysctl names can be mirrored automatically under /proc/sys.  The
@@ -1019,24 +1019,11 @@ extern ctl_handler sysctl_ms_jiffies;
  * under /proc; non-leaf nodes will be represented by directories.  A
  * null procname disables /proc mirroring at this node.
  *
- * sysctl entries with a zero ctl_name will not be available through
- * the binary sysctl interface.
- *
  * sysctl(2) can automatically manage read and write requests through
  * the sysctl table.  The data and maxlen fields of the ctl_table
  * struct enable minimal validation of the values being written to be
  * performed, and the mode field allows minimal authentication.
  * 
- * More sophisticated management can be enabled by the provision of a
- * strategy routine with the table entry.  This will be called before
- * any automatic read or write of the data is performed.
- * 
- * The strategy routine may return:
- * <0: Error occurred (error is passed to user process)
- * 0:  OK - proceed with automatic read or write.
- * >0: OK - read or write has been done by the strategy routine, so 
- *     return immediately.
- * 
  * There must be a proc_handler routine for any terminal nodes
  * mirrored under /proc/sys (non-terminals are handled by a built-in
  * directory handler).  Several default handlers are available to
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f6dacc383c16..b6b6eb1abebb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1618,7 +1618,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
 
 static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 {
-	for (; table->ctl_name || table->procname; table++) {
+	for (; table->procname; table++) {
 		table->parent = parent;
 		if (table->child)
 			sysctl_set_parent(table, table->child);
@@ -1650,11 +1650,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch,
 		return NULL;
 
 	/* ... and nothing else */
-	if (branch[1].procname || branch[1].ctl_name)
+	if (branch[1].procname)
 		return NULL;
 
 	/* table should contain subdirectory with the same name */
-	for (p = table; p->procname || p->ctl_name; p++) {
+	for (p = table; p->procname; p++) {
 		if (!p->child)
 			continue;
 		if (p->procname && strcmp(p->procname, s) == 0)
@@ -1699,8 +1699,7 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  *
  * The members of the &struct ctl_table structure are used as follows:
  *
- * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
- *            must be unique within that level of sysctl
+ * ctl_name - Dead
  *
  * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
  *            enter a sysctl file
@@ -1716,7 +1715,7 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  *
  * proc_handler - the text handler routine (described below)
  *
- * strategy - the strategy routine (described below)
+ * strategy - Dead
  *
  * de - for internal use by the sysctl routines
  *
@@ -1730,19 +1729,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  * struct enable minimal validation of the values being written to be
  * performed, and the mode field allows minimal authentication.
  *
- * More sophisticated management can be enabled by the provision of a
- * strategy routine with the table entry.  This will be called before
- * any automatic read or write of the data is performed.
- *
- * The strategy routine may return
- *
- * < 0 - Error occurred (error is passed to user process)
- *
- * 0   - OK - proceed with automatic read or write.
- *
- * > 0 - OK - read or write has been done by the strategy routine, so
- *       return immediately.
- *
  * There must be a proc_handler routine for any terminal nodes
  * mirrored under /proc/sys (non-terminals are handled by a built-in
  * directory handler).  Several default handlers are available to
@@ -1769,13 +1755,13 @@ struct ctl_table_header *__register_sysctl_paths(
 	struct ctl_table_set *set;
 
 	/* Count the path components */
-	for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
+	for (npath = 0; path[npath].procname; ++npath)
 		;
 
 	/*
 	 * For each path component, allocate a 2-element ctl_table array.
 	 * The first array element will be filled with the sysctl entry
-	 * for this, the second will be the sentinel (ctl_name == 0).
+	 * for this, the second will be the sentinel (procname == 0).
 	 *
 	 * We allocate everything in one go so that we don't have to
 	 * worry about freeing additional memory in unregister_sysctl_table.
@@ -1792,7 +1778,6 @@ struct ctl_table_header *__register_sysctl_paths(
 	for (n = 0; n < npath; ++n, ++path) {
 		/* Copy the procname */
 		new->procname = path->procname;
-		new->ctl_name = path->ctl_name;
 		new->mode     = 0555;
 
 		*prevp = new;
-- 
cgit v1.2.3


From 3491707a070c1183c709516b2f876f798c7a9a84 Mon Sep 17 00:00:00 2001
From: Rui Paulo <rpaulo@gmail.com>
Date: Mon, 9 Nov 2009 23:46:39 +0000
Subject: mac80211: update meshconf IE

This updates the Mesh Configuration IE according to the latest
draft (3.03).
Notable changes include the simplified protocol IDs.

Signed-off-by: Rui Paulo <rpaulo@gmail.com>
Signed-off-by: Javier Cardona <javier@cozybit.com>
Reviewed-by: Andrey Yurovsky <andrey@cozybit.com>
Tested-by: Brian Cavagnolo <brian@cozybit.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h  |  2 +-
 net/mac80211/ieee80211_i.h | 10 +++----
 net/mac80211/mesh.c        | 69 +++++++++++++++++++---------------------------
 3 files changed, 35 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 0aa831467493..50c684db33c7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -115,7 +115,7 @@
 #define IEEE80211_MAX_SSID_LEN		32
 
 #define IEEE80211_MAX_MESH_ID_LEN	32
-#define IEEE80211_MESH_CONFIG_LEN	24
+#define IEEE80211_MESH_CONFIG_LEN	7
 
 #define IEEE80211_QOS_CTL_LEN		2
 #define IEEE80211_QOS_CTL_TID_MASK	0x000F
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 1ef767366b77..1f4f88a8f80c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -361,15 +361,15 @@ struct ieee80211_if_mesh {
 	u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN];
 	size_t mesh_id_len;
 	/* Active Path Selection Protocol Identifier */
-	u8 mesh_pp_id[4];
+	u8 mesh_pp_id;
 	/* Active Path Selection Metric Identifier */
-	u8 mesh_pm_id[4];
+	u8 mesh_pm_id;
 	/* Congestion Control Mode Identifier */
-	u8 mesh_cc_id[4];
+	u8 mesh_cc_id;
 	/* Synchronization Protocol Identifier */
-	u8 mesh_sp_id[4];
+	u8 mesh_sp_id;
 	/* Authentication Protocol Identifier */
-	u8 mesh_auth_id[4];
+	u8 mesh_auth_id;
 	/* Local mesh Destination Sequence Number */
 	u32 dsn;
 	/* Last used PREQ ID */
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 9a733890eb47..a49a3374acb1 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -15,14 +15,14 @@
 #define IEEE80211_MESH_PEER_INACTIVITY_LIMIT (1800 * HZ)
 #define IEEE80211_MESH_HOUSEKEEPING_INTERVAL (60 * HZ)
 
-#define PP_OFFSET 	1		/* Path Selection Protocol */
-#define PM_OFFSET	5		/* Path Selection Metric   */
-#define CC_OFFSET	9		/* Congestion Control Mode */
-#define SP_OFFSET	13		/* Synchronization Protocol */
-#define AUTH_OFFSET	17		/* Authentication Protocol */
-#define CAPAB_OFFSET 	22
-#define CAPAB_ACCEPT_PLINKS 0x80
-#define CAPAB_FORWARDING    0x10
+#define MESHCONF_PP_OFFSET 	0		/* Path Selection Protocol */
+#define MESHCONF_PM_OFFSET	1		/* Path Selection Metric   */
+#define MESHCONF_CC_OFFSET	2		/* Congestion Control Mode */
+#define MESHCONF_SP_OFFSET	3		/* Synchronization Protocol */
+#define MESHCONF_AUTH_OFFSET	4		/* Authentication Protocol */
+#define MESHCONF_CAPAB_OFFSET 	6
+#define MESHCONF_CAPAB_ACCEPT_PLINKS 0x01
+#define MESHCONF_CAPAB_FORWARDING    0x08
 
 #define TMR_RUNNING_HK	0
 #define TMR_RUNNING_MP	1
@@ -85,11 +85,12 @@ bool mesh_matches_local(struct ieee802_11_elems *ie, struct ieee80211_sub_if_dat
 	 */
 	if (ifmsh->mesh_id_len == ie->mesh_id_len &&
 		memcmp(ifmsh->mesh_id, ie->mesh_id, ie->mesh_id_len) == 0 &&
-		memcmp(ifmsh->mesh_pp_id, ie->mesh_config + PP_OFFSET, 4) == 0 &&
-		memcmp(ifmsh->mesh_pm_id, ie->mesh_config + PM_OFFSET, 4) == 0 &&
-		memcmp(ifmsh->mesh_cc_id, ie->mesh_config + CC_OFFSET, 4) == 0 &&
-		memcmp(ifmsh->mesh_sp_id, ie->mesh_config + SP_OFFSET, 4) == 0 &&
-		memcmp(ifmsh->mesh_auth_id, ie->mesh_config + AUTH_OFFSET, 4) == 0)
+		(ifmsh->mesh_pp_id == *(ie->mesh_config + MESHCONF_PP_OFFSET))&&
+		(ifmsh->mesh_pm_id == *(ie->mesh_config + MESHCONF_PM_OFFSET))&&
+		(ifmsh->mesh_cc_id == *(ie->mesh_config + MESHCONF_CC_OFFSET))&&
+		(ifmsh->mesh_sp_id == *(ie->mesh_config + MESHCONF_SP_OFFSET))&&
+		(ifmsh->mesh_auth_id == *(ie->mesh_config +
+		    MESHCONF_AUTH_OFFSET)))
 		return true;
 
 	return false;
@@ -102,7 +103,8 @@ bool mesh_matches_local(struct ieee802_11_elems *ie, struct ieee80211_sub_if_dat
  */
 bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie)
 {
-	return (*(ie->mesh_config + CAPAB_OFFSET) & CAPAB_ACCEPT_PLINKS) != 0;
+	return (*(ie->mesh_config + MESHCONF_CAPAB_OFFSET) &
+	    MESHCONF_CAPAB_ACCEPT_PLINKS) != 0;
 }
 
 /**
@@ -128,18 +130,11 @@ void mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata)
 
 void mesh_ids_set_default(struct ieee80211_if_mesh *sta)
 {
-	u8 oui[3] = {0x00, 0x0F, 0xAC};
-
-	memcpy(sta->mesh_pp_id, oui, sizeof(oui));
-	memcpy(sta->mesh_pm_id, oui, sizeof(oui));
-	memcpy(sta->mesh_cc_id, oui, sizeof(oui));
-	memcpy(sta->mesh_sp_id, oui, sizeof(oui));
-	memcpy(sta->mesh_auth_id, oui, sizeof(oui));
-	sta->mesh_pp_id[sizeof(oui)] = 0;
-	sta->mesh_pm_id[sizeof(oui)] = 0;
-	sta->mesh_cc_id[sizeof(oui)] = 0xff;
-	sta->mesh_sp_id[sizeof(oui)] = 0xff;
-	sta->mesh_auth_id[sizeof(oui)] = 0x0;
+	sta->mesh_pp_id = 0;	/* HWMP */
+	sta->mesh_pm_id = 0;	/* Airtime */
+	sta->mesh_cc_id = 0;	/* Disabled */
+	sta->mesh_sp_id = 0;	/* Neighbor Offset */
+	sta->mesh_auth_id = 0;	/* Disabled */
 }
 
 int mesh_rmc_init(struct ieee80211_sub_if_data *sdata)
@@ -260,28 +255,21 @@ void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata)
 	pos = skb_put(skb, 2 + IEEE80211_MESH_CONFIG_LEN);
 	*pos++ = WLAN_EID_MESH_CONFIG;
 	*pos++ = IEEE80211_MESH_CONFIG_LEN;
-	/* Version */
-	*pos++ = 1;
 
 	/* Active path selection protocol ID */
-	memcpy(pos, sdata->u.mesh.mesh_pp_id, 4);
-	pos += 4;
+	*pos++ = sdata->u.mesh.mesh_pp_id;
 
 	/* Active path selection metric ID   */
-	memcpy(pos, sdata->u.mesh.mesh_pm_id, 4);
-	pos += 4;
+	*pos++ = sdata->u.mesh.mesh_pm_id;
 
 	/* Congestion control mode identifier */
-	memcpy(pos, sdata->u.mesh.mesh_cc_id, 4);
-	pos += 4;
+	*pos++ = sdata->u.mesh.mesh_cc_id;
 
 	/* Synchronization protocol identifier */
-	memcpy(pos, sdata->u.mesh.mesh_sp_id, 4);
-	pos += 4;
+	*pos++ = sdata->u.mesh.mesh_sp_id;
 
 	/* Authentication Protocol identifier */
-	memcpy(pos, sdata->u.mesh.mesh_auth_id, 4);
-	pos += 4;
+	*pos++ = sdata->u.mesh.mesh_auth_id;
 
 	/* Mesh Formation Info */
 	memset(pos, 0x00, 1);
@@ -289,8 +277,9 @@ void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata)
 
 	/* Mesh capability */
 	sdata->u.mesh.accepting_plinks = mesh_plink_availables(sdata);
-	*pos = CAPAB_FORWARDING;
-	*pos++ |= sdata->u.mesh.accepting_plinks ? CAPAB_ACCEPT_PLINKS : 0x00;
+	*pos = MESHCONF_CAPAB_FORWARDING;
+	*pos++ |= sdata->u.mesh.accepting_plinks ?
+	    MESHCONF_CAPAB_ACCEPT_PLINKS : 0x00;
 	*pos++ = 0x00;
 
 	return;
-- 
cgit v1.2.3


From 8b787643ca0a5130c647109d77fe512f89cfa611 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Tue, 10 Nov 2009 18:53:10 +0100
Subject: nl80211: add a parameter for using 4-address frames on virtual
 interfaces

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h |  4 ++++
 include/net/cfg80211.h  |  2 ++
 net/wireless/nl80211.c  | 11 +++++++++++
 3 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 50afca3dcff1..203adef972cf 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -584,6 +584,8 @@ enum nl80211_commands {
  *	changed then the list changed and the dump should be repeated
  *	completely from scratch.
  *
+ * @NL80211_ATTR_4ADDR: Use 4-address frames on a virtual interface
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -714,6 +716,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_PID,
 
+	NL80211_ATTR_4ADDR,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ff67865de231..1ee41e4a92e2 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -206,10 +206,12 @@ struct ieee80211_supported_band {
  * struct vif_params - describes virtual interface parameters
  * @mesh_id: mesh ID to use
  * @mesh_id_len: length of the mesh ID
+ * @use_4addr: use 4-address frames
  */
 struct vif_params {
        u8 *mesh_id;
        int mesh_id_len;
+       int use_4addr;
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 8ed62b6c172b..8c8e4eae6a17 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -138,6 +138,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 	[NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
 	[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
 	[NL80211_ATTR_PID] = { .type = NLA_U32 },
+	[NL80211_ATTR_4ADDR] = { .type = NLA_U8 },
 };
 
 /* policy for the attributes */
@@ -987,6 +988,13 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 		change = true;
 	}
 
+	if (info->attrs[NL80211_ATTR_4ADDR]) {
+		params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
+		change = true;
+	} else {
+		params.use_4addr = -1;
+	}
+
 	if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) {
 		if (ntype != NL80211_IFTYPE_MONITOR) {
 			err = -EINVAL;
@@ -1053,6 +1061,9 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 		params.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
 	}
 
+	if (info->attrs[NL80211_ATTR_4ADDR])
+		params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
+
 	err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ?
 				  info->attrs[NL80211_ATTR_MNTR_FLAGS] : NULL,
 				  &flags);
-- 
cgit v1.2.3


From 4739a9748e1bd7459f22f7e94e7d85710ca83954 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 3 Apr 2009 05:36:01 -0700
Subject: sysctl: Remove the last of the generic binary sysctl support

Now that all of the users stopped using ctl_name and strategy it
is safe to remove the fields from struct ctl_table, and it is safe
to remove the stub strategy routines as well.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/sysctl.h |  3 ---
 kernel/sysctl.c        | 44 --------------------------------------------
 2 files changed, 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7c4aabc04673..4e40442777cf 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1033,7 +1033,6 @@ extern ctl_handler sysctl_ms_jiffies;
 /* A sysctl table is an array of struct ctl_table: */
 struct ctl_table 
 {
-	int ctl_name;			/* Binary ID */
 	const char *procname;		/* Text ID for /proc/sys, or zero */
 	void *data;
 	int maxlen;
@@ -1041,7 +1040,6 @@ struct ctl_table
 	struct ctl_table *child;
 	struct ctl_table *parent;	/* Automatically set */
 	proc_handler *proc_handler;	/* Callback for text formatting */
-	ctl_handler *strategy;		/* Callback function for all r/w */
 	void *extra1;
 	void *extra2;
 };
@@ -1075,7 +1073,6 @@ struct ctl_table_header
 /* struct ctl_path describes where in the hierarchy a table is added */
 struct ctl_path {
 	const char *procname;
-	int ctl_name;
 };
 
 void register_sysctl_root(struct ctl_table_root *root);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b6b6eb1abebb..b4a5763d6dc8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1699,8 +1699,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  *
  * The members of the &struct ctl_table structure are used as follows:
  *
- * ctl_name - Dead
- *
  * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
  *            enter a sysctl file
  *
@@ -1715,8 +1713,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
  *
  * proc_handler - the text handler routine (described below)
  *
- * strategy - Dead
- *
  * de - for internal use by the sysctl routines
  *
  * extra1, extra2 - extra pointers usable by the proc handler routines
@@ -2639,41 +2635,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 
 #endif /* CONFIG_PROC_FS */
 
-int sysctl_data(struct ctl_table *table,
-		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
-int sysctl_string(struct ctl_table *table,
-		  void __user *oldval, size_t __user *oldlenp,
-		  void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
-int sysctl_intvec(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
-int sysctl_jiffies(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
-int sysctl_ms_jiffies(struct ctl_table *table,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
 /*
  * No sense putting this after each symbol definition, twice,
  * exception granted :-)
@@ -2688,9 +2649,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax);
 EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
 EXPORT_SYMBOL(register_sysctl_table);
 EXPORT_SYMBOL(register_sysctl_paths);
-EXPORT_SYMBOL(sysctl_intvec);
-EXPORT_SYMBOL(sysctl_jiffies);
-EXPORT_SYMBOL(sysctl_ms_jiffies);
-EXPORT_SYMBOL(sysctl_string);
-EXPORT_SYMBOL(sysctl_data);
 EXPORT_SYMBOL(unregister_sysctl_table);
-- 
cgit v1.2.3


From 6beb000923882f6204ea2cfcd932e568e900803f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Nov 2009 15:21:34 +0000
Subject: locking: Make inlining decision Kconfig based

commit 892a7c67 (locking: Allow arch-inlined spinlocks) implements the
selection of which lock functions are inlined based on defines in
arch/.../spinlock.h: #define __always_inline__LOCK_FUNCTION

Despite of the name __always_inline__* the lock functions can be built
out of line depending on config options. Also if the arch does not set
some inline defines the generic code might set them; again depending on
config options.

This makes it unnecessary hard to figure out when and which lock
functions are inlined. Aside of that it makes it way harder and
messier for -rt to manipulate the lock functions.

Convert the inlining decision to CONFIG switches. Each lock function
is inlined depending on CONFIG_INLINE_*. The configs implement the
existing dependencies. The architecture code can select ARCH_INLINE_*
to signal that it wants the corresponding lock function inlined.
ARCH_INLINE_* is necessary as Kconfig ignores "depends on"
restrictions when a config element is selected.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <20091109151428.504477141@linutronix.de>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/s390/Kconfig                |  28 ++++++
 arch/s390/include/asm/spinlock.h |  29 ------
 include/linux/spinlock_api_smp.h |  75 ++++++---------
 init/Kconfig                     |   1 +
 kernel/Kconfig.locks             | 199 +++++++++++++++++++++++++++++++++++++++
 kernel/spinlock.c                |  56 +++++------
 6 files changed, 284 insertions(+), 104 deletions(-)
 create mode 100644 kernel/Kconfig.locks

(limited to 'include/linux')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 43c0acad7160..16c673096a22 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -95,6 +95,34 @@ config S390
 	select HAVE_ARCH_TRACEHOOK
 	select INIT_ALL_POSSIBLE
 	select HAVE_PERF_EVENTS
+	select ARCH_INLINE_SPIN_TRYLOCK
+	select ARCH_INLINE_SPIN_TRYLOCK_BH
+	select ARCH_INLINE_SPIN_LOCK
+	select ARCH_INLINE_SPIN_LOCK_BH
+	select ARCH_INLINE_SPIN_LOCK_IRQ
+	select ARCH_INLINE_SPIN_LOCK_IRQSAVE
+	select ARCH_INLINE_SPIN_UNLOCK
+	select ARCH_INLINE_SPIN_UNLOCK_BH
+	select ARCH_INLINE_SPIN_UNLOCK_IRQ
+	select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+	select ARCH_INLINE_READ_TRYLOCK
+	select ARCH_INLINE_READ_LOCK
+	select ARCH_INLINE_READ_LOCK_BH
+	select ARCH_INLINE_READ_LOCK_IRQ
+	select ARCH_INLINE_READ_LOCK_IRQSAVE
+	select ARCH_INLINE_READ_UNLOCK
+	select ARCH_INLINE_READ_UNLOCK_BH
+	select ARCH_INLINE_READ_UNLOCK_IRQ
+	select ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+	select ARCH_INLINE_WRITE_TRYLOCK
+	select ARCH_INLINE_WRITE_LOCK
+	select ARCH_INLINE_WRITE_LOCK_BH
+	select ARCH_INLINE_WRITE_LOCK_IRQ
+	select ARCH_INLINE_WRITE_LOCK_IRQSAVE
+	select ARCH_INLINE_WRITE_UNLOCK
+	select ARCH_INLINE_WRITE_UNLOCK_BH
+	select ARCH_INLINE_WRITE_UNLOCK_IRQ
+	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 
 config SCHED_OMIT_FRAME_POINTER
 	bool
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 41ce6861174e..c9af0d19c7ab 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -191,33 +191,4 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
 
-#define __always_inline__spin_lock
-#define __always_inline__read_lock
-#define __always_inline__write_lock
-#define __always_inline__spin_lock_bh
-#define __always_inline__read_lock_bh
-#define __always_inline__write_lock_bh
-#define __always_inline__spin_lock_irq
-#define __always_inline__read_lock_irq
-#define __always_inline__write_lock_irq
-#define __always_inline__spin_lock_irqsave
-#define __always_inline__read_lock_irqsave
-#define __always_inline__write_lock_irqsave
-#define __always_inline__spin_trylock
-#define __always_inline__read_trylock
-#define __always_inline__write_trylock
-#define __always_inline__spin_trylock_bh
-#define __always_inline__spin_unlock
-#define __always_inline__read_unlock
-#define __always_inline__write_unlock
-#define __always_inline__spin_unlock_bh
-#define __always_inline__read_unlock_bh
-#define __always_inline__write_unlock_bh
-#define __always_inline__spin_unlock_irq
-#define __always_inline__read_unlock_irq
-#define __always_inline__write_unlock_irq
-#define __always_inline__spin_unlock_irqrestore
-#define __always_inline__read_unlock_irqrestore
-#define __always_inline__write_unlock_irqrestore
-
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 7a7e18fc2415..8264a7f459bc 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -60,137 +60,118 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 							__releases(lock);
 
-/*
- * We inline the unlock functions in the nondebug case:
- */
-#if !defined(CONFIG_DEBUG_SPINLOCK) && !defined(CONFIG_PREEMPT)
-#define __always_inline__spin_unlock
-#define __always_inline__read_unlock
-#define __always_inline__write_unlock
-#define __always_inline__spin_unlock_irq
-#define __always_inline__read_unlock_irq
-#define __always_inline__write_unlock_irq
-#endif
-
-#ifndef CONFIG_DEBUG_SPINLOCK
-#ifndef CONFIG_GENERIC_LOCKBREAK
-
-#ifdef __always_inline__spin_lock
+#ifdef CONFIG_INLINE_SPIN_LOCK
 #define _spin_lock(lock) __spin_lock(lock)
 #endif
 
-#ifdef __always_inline__read_lock
+#ifdef CONFIG_INLINE_READ_LOCK
 #define _read_lock(lock) __read_lock(lock)
 #endif
 
-#ifdef __always_inline__write_lock
+#ifdef CONFIG_INLINE_WRITE_LOCK
 #define _write_lock(lock) __write_lock(lock)
 #endif
 
-#ifdef __always_inline__spin_lock_bh
+#ifdef CONFIG_INLINE_SPIN_LOCK_BH
 #define _spin_lock_bh(lock) __spin_lock_bh(lock)
 #endif
 
-#ifdef __always_inline__read_lock_bh
+#ifdef CONFIG_INLINE_READ_LOCK_BH
 #define _read_lock_bh(lock) __read_lock_bh(lock)
 #endif
 
-#ifdef __always_inline__write_lock_bh
+#ifdef CONFIG_INLINE_WRITE_LOCK_BH
 #define _write_lock_bh(lock) __write_lock_bh(lock)
 #endif
 
-#ifdef __always_inline__spin_lock_irq
+#ifdef CONFIG_INLINE_SPIN_LOCK_IRQ
 #define _spin_lock_irq(lock) __spin_lock_irq(lock)
 #endif
 
-#ifdef __always_inline__read_lock_irq
+#ifdef CONFIG_INLINE_READ_LOCK_IRQ
 #define _read_lock_irq(lock) __read_lock_irq(lock)
 #endif
 
-#ifdef __always_inline__write_lock_irq
+#ifdef CONFIG_INLINE_WRITE_LOCK_IRQ
 #define _write_lock_irq(lock) __write_lock_irq(lock)
 #endif
 
-#ifdef __always_inline__spin_lock_irqsave
+#ifdef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
 #define _spin_lock_irqsave(lock) __spin_lock_irqsave(lock)
 #endif
 
-#ifdef __always_inline__read_lock_irqsave
+#ifdef CONFIG_INLINE_READ_LOCK_IRQSAVE
 #define _read_lock_irqsave(lock) __read_lock_irqsave(lock)
 #endif
 
-#ifdef __always_inline__write_lock_irqsave
+#ifdef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
 #define _write_lock_irqsave(lock) __write_lock_irqsave(lock)
 #endif
 
-#endif /* !CONFIG_GENERIC_LOCKBREAK */
-
-#ifdef __always_inline__spin_trylock
+#ifdef CONFIG_INLINE_SPIN_TRYLOCK
 #define _spin_trylock(lock) __spin_trylock(lock)
 #endif
 
-#ifdef __always_inline__read_trylock
+#ifdef CONFIG_INLINE_READ_TRYLOCK
 #define _read_trylock(lock) __read_trylock(lock)
 #endif
 
-#ifdef __always_inline__write_trylock
+#ifdef CONFIG_INLINE_WRITE_TRYLOCK
 #define _write_trylock(lock) __write_trylock(lock)
 #endif
 
-#ifdef __always_inline__spin_trylock_bh
+#ifdef CONFIG_INLINE_SPIN_TRYLOCK_BH
 #define _spin_trylock_bh(lock) __spin_trylock_bh(lock)
 #endif
 
-#ifdef __always_inline__spin_unlock
+#ifdef CONFIG_INLINE_SPIN_UNLOCK
 #define _spin_unlock(lock) __spin_unlock(lock)
 #endif
 
-#ifdef __always_inline__read_unlock
+#ifdef CONFIG_INLINE_READ_UNLOCK
 #define _read_unlock(lock) __read_unlock(lock)
 #endif
 
-#ifdef __always_inline__write_unlock
+#ifdef CONFIG_INLINE_WRITE_UNLOCK
 #define _write_unlock(lock) __write_unlock(lock)
 #endif
 
-#ifdef __always_inline__spin_unlock_bh
+#ifdef CONFIG_INLINE_SPIN_UNLOCK_BH
 #define _spin_unlock_bh(lock) __spin_unlock_bh(lock)
 #endif
 
-#ifdef __always_inline__read_unlock_bh
+#ifdef CONFIG_INLINE_READ_UNLOCK_BH
 #define _read_unlock_bh(lock) __read_unlock_bh(lock)
 #endif
 
-#ifdef __always_inline__write_unlock_bh
+#ifdef CONFIG_INLINE_WRITE_UNLOCK_BH
 #define _write_unlock_bh(lock) __write_unlock_bh(lock)
 #endif
 
-#ifdef __always_inline__spin_unlock_irq
+#ifdef CONFIG_INLINE_SPIN_UNLOCK_IRQ
 #define _spin_unlock_irq(lock) __spin_unlock_irq(lock)
 #endif
 
-#ifdef __always_inline__read_unlock_irq
+#ifdef CONFIG_INLINE_READ_UNLOCK_IRQ
 #define _read_unlock_irq(lock) __read_unlock_irq(lock)
 #endif
 
-#ifdef __always_inline__write_unlock_irq
+#ifdef CONFIG_INLINE_WRITE_UNLOCK_IRQ
 #define _write_unlock_irq(lock) __write_unlock_irq(lock)
 #endif
 
-#ifdef __always_inline__spin_unlock_irqrestore
+#ifdef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
 #define _spin_unlock_irqrestore(lock, flags) __spin_unlock_irqrestore(lock, flags)
 #endif
 
-#ifdef __always_inline__read_unlock_irqrestore
+#ifdef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
 #define _read_unlock_irqrestore(lock, flags) __read_unlock_irqrestore(lock, flags)
 #endif
 
-#ifdef __always_inline__write_unlock_irqrestore
+#ifdef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
 #define _write_unlock_irqrestore(lock, flags) __write_unlock_irqrestore(lock, flags)
 #endif
 
-#endif /* CONFIG_DEBUG_SPINLOCK */
-
 static inline int __spin_trylock(spinlock_t *lock)
 {
 	preempt_disable();
diff --git a/init/Kconfig b/init/Kconfig
index 9e03ef8b311e..06863dd7bf49 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1210,3 +1210,4 @@ source "block/Kconfig"
 config PREEMPT_NOTIFIERS
 	bool
 
+source "kernel/Kconfig.locks"
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000000..d1f86db71451
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,199 @@
+#
+# The ARCH_INLINE foo is necessary because select ignores "depends on"
+#
+config ARCH_INLINE_SPIN_TRYLOCK
+	bool
+
+config ARCH_INLINE_SPIN_TRYLOCK_BH
+	bool
+
+config ARCH_INLINE_SPIN_LOCK
+	bool
+
+config ARCH_INLINE_SPIN_LOCK_BH
+	bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQ
+	bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQSAVE
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK_BH
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQ
+	bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+	bool
+
+
+config ARCH_INLINE_READ_TRYLOCK
+	bool
+
+config ARCH_INLINE_READ_LOCK
+	bool
+
+config ARCH_INLINE_READ_LOCK_BH
+	bool
+
+config ARCH_INLINE_READ_LOCK_IRQ
+	bool
+
+config ARCH_INLINE_READ_LOCK_IRQSAVE
+	bool
+
+config ARCH_INLINE_READ_UNLOCK
+	bool
+
+config ARCH_INLINE_READ_UNLOCK_BH
+	bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQ
+	bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+	bool
+
+
+config ARCH_INLINE_WRITE_TRYLOCK
+	bool
+
+config ARCH_INLINE_WRITE_LOCK
+	bool
+
+config ARCH_INLINE_WRITE_LOCK_BH
+	bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQ
+	bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQSAVE
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK_BH
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQ
+	bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+	bool
+
+#
+# lock_* functions are inlined when:
+#   - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
+#
+# trylock_* functions are inlined when:
+#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+# unlock and unlock_irq functions are inlined when:
+#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#  or
+#   - DEBUG_SPINLOCK=n and PREEMPT=n
+#
+# unlock_bh and unlock_irqrestore functions are inlined when:
+#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+
+config INLINE_SPIN_TRYLOCK
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
+
+config INLINE_SPIN_TRYLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
+
+config INLINE_SPIN_LOCK
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+
+config INLINE_SPIN_LOCK_BH
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_SPIN_LOCK_BH
+
+config INLINE_SPIN_LOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_SPIN_LOCK_IRQ
+
+config INLINE_SPIN_LOCK_IRQSAVE
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_SPIN_LOCK_IRQSAVE
+
+config INLINE_SPIN_UNLOCK
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+
+config INLINE_SPIN_UNLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
+
+config INLINE_SPIN_UNLOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
+
+config INLINE_SPIN_UNLOCK_IRQRESTORE
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+
+
+config INLINE_READ_TRYLOCK
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
+
+config INLINE_READ_LOCK
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+
+config INLINE_READ_LOCK_BH
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_READ_LOCK_BH
+
+config INLINE_READ_LOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_READ_LOCK_IRQ
+
+config INLINE_READ_LOCK_IRQSAVE
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_READ_LOCK_IRQSAVE
+
+config INLINE_READ_UNLOCK
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
+
+config INLINE_READ_UNLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
+
+config INLINE_READ_UNLOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
+
+config INLINE_READ_UNLOCK_IRQRESTORE
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+
+
+config INLINE_WRITE_TRYLOCK
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
+
+config INLINE_WRITE_LOCK
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+
+config INLINE_WRITE_LOCK_BH
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_WRITE_LOCK_BH
+
+config INLINE_WRITE_LOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_WRITE_LOCK_IRQ
+
+config INLINE_WRITE_LOCK_IRQSAVE
+	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+		 ARCH_INLINE_WRITE_LOCK_IRQSAVE
+
+config INLINE_WRITE_UNLOCK
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
+
+config INLINE_WRITE_UNLOCK_BH
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
+
+config INLINE_WRITE_UNLOCK_IRQ
+	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
+
+config INLINE_WRITE_UNLOCK_IRQRESTORE
+	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2f..235a9579a875 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,7 +21,7 @@
 #include <linux/debug_locks.h>
 #include <linux/module.h>
 
-#ifndef _spin_trylock
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK
 int __lockfunc _spin_trylock(spinlock_t *lock)
 {
 	return __spin_trylock(lock);
@@ -29,7 +29,7 @@ int __lockfunc _spin_trylock(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_trylock);
 #endif
 
-#ifndef _read_trylock
+#ifndef CONFIG_INLINE_READ_TRYLOCK
 int __lockfunc _read_trylock(rwlock_t *lock)
 {
 	return __read_trylock(lock);
@@ -37,7 +37,7 @@ int __lockfunc _read_trylock(rwlock_t *lock)
 EXPORT_SYMBOL(_read_trylock);
 #endif
 
-#ifndef _write_trylock
+#ifndef CONFIG_INLINE_WRITE_TRYLOCK
 int __lockfunc _write_trylock(rwlock_t *lock)
 {
 	return __write_trylock(lock);
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(_write_trylock);
  */
 #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
 
-#ifndef _read_lock
+#ifndef CONFIG_INLINE_READ_LOCK
 void __lockfunc _read_lock(rwlock_t *lock)
 {
 	__read_lock(lock);
@@ -60,7 +60,7 @@ void __lockfunc _read_lock(rwlock_t *lock)
 EXPORT_SYMBOL(_read_lock);
 #endif
 
-#ifndef _spin_lock_irqsave
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
 unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 {
 	return __spin_lock_irqsave(lock);
@@ -68,7 +68,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_lock_irqsave);
 #endif
 
-#ifndef _spin_lock_irq
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
 void __lockfunc _spin_lock_irq(spinlock_t *lock)
 {
 	__spin_lock_irq(lock);
@@ -76,7 +76,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_lock_irq);
 #endif
 
-#ifndef _spin_lock_bh
+#ifndef CONFIG_INLINE_SPIN_LOCK_BH
 void __lockfunc _spin_lock_bh(spinlock_t *lock)
 {
 	__spin_lock_bh(lock);
@@ -84,7 +84,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_lock_bh);
 #endif
 
-#ifndef _read_lock_irqsave
+#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
 unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 {
 	return __read_lock_irqsave(lock);
@@ -92,7 +92,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 EXPORT_SYMBOL(_read_lock_irqsave);
 #endif
 
-#ifndef _read_lock_irq
+#ifndef CONFIG_INLINE_READ_LOCK_IRQ
 void __lockfunc _read_lock_irq(rwlock_t *lock)
 {
 	__read_lock_irq(lock);
@@ -100,7 +100,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock)
 EXPORT_SYMBOL(_read_lock_irq);
 #endif
 
-#ifndef _read_lock_bh
+#ifndef CONFIG_INLINE_READ_LOCK_BH
 void __lockfunc _read_lock_bh(rwlock_t *lock)
 {
 	__read_lock_bh(lock);
@@ -108,7 +108,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock)
 EXPORT_SYMBOL(_read_lock_bh);
 #endif
 
-#ifndef _write_lock_irqsave
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
 unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
 {
 	return __write_lock_irqsave(lock);
@@ -116,7 +116,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
 EXPORT_SYMBOL(_write_lock_irqsave);
 #endif
 
-#ifndef _write_lock_irq
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
 void __lockfunc _write_lock_irq(rwlock_t *lock)
 {
 	__write_lock_irq(lock);
@@ -124,7 +124,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock)
 EXPORT_SYMBOL(_write_lock_irq);
 #endif
 
-#ifndef _write_lock_bh
+#ifndef CONFIG_INLINE_WRITE_LOCK_BH
 void __lockfunc _write_lock_bh(rwlock_t *lock)
 {
 	__write_lock_bh(lock);
@@ -132,7 +132,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
 EXPORT_SYMBOL(_write_lock_bh);
 #endif
 
-#ifndef _spin_lock
+#ifndef CONFIG_INLINE_SPIN_LOCK
 void __lockfunc _spin_lock(spinlock_t *lock)
 {
 	__spin_lock(lock);
@@ -140,7 +140,7 @@ void __lockfunc _spin_lock(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_lock);
 #endif
 
-#ifndef _write_lock
+#ifndef CONFIG_INLINE_WRITE_LOCK
 void __lockfunc _write_lock(rwlock_t *lock)
 {
 	__write_lock(lock);
@@ -272,7 +272,7 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
 
 #endif
 
-#ifndef _spin_unlock
+#ifndef CONFIG_INLINE_SPIN_UNLOCK
 void __lockfunc _spin_unlock(spinlock_t *lock)
 {
 	__spin_unlock(lock);
@@ -280,7 +280,7 @@ void __lockfunc _spin_unlock(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_unlock);
 #endif
 
-#ifndef _write_unlock
+#ifndef CONFIG_INLINE_WRITE_UNLOCK
 void __lockfunc _write_unlock(rwlock_t *lock)
 {
 	__write_unlock(lock);
@@ -288,7 +288,7 @@ void __lockfunc _write_unlock(rwlock_t *lock)
 EXPORT_SYMBOL(_write_unlock);
 #endif
 
-#ifndef _read_unlock
+#ifndef CONFIG_INLINE_READ_UNLOCK
 void __lockfunc _read_unlock(rwlock_t *lock)
 {
 	__read_unlock(lock);
@@ -296,7 +296,7 @@ void __lockfunc _read_unlock(rwlock_t *lock)
 EXPORT_SYMBOL(_read_unlock);
 #endif
 
-#ifndef _spin_unlock_irqrestore
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	__spin_unlock_irqrestore(lock, flags);
@@ -304,7 +304,7 @@ void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 EXPORT_SYMBOL(_spin_unlock_irqrestore);
 #endif
 
-#ifndef _spin_unlock_irq
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
 void __lockfunc _spin_unlock_irq(spinlock_t *lock)
 {
 	__spin_unlock_irq(lock);
@@ -312,7 +312,7 @@ void __lockfunc _spin_unlock_irq(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_unlock_irq);
 #endif
 
-#ifndef _spin_unlock_bh
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 {
 	__spin_unlock_bh(lock);
@@ -320,7 +320,7 @@ void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 EXPORT_SYMBOL(_spin_unlock_bh);
 #endif
 
-#ifndef _read_unlock_irqrestore
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
 void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
 	__read_unlock_irqrestore(lock, flags);
@@ -328,7 +328,7 @@ void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 EXPORT_SYMBOL(_read_unlock_irqrestore);
 #endif
 
-#ifndef _read_unlock_irq
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
 void __lockfunc _read_unlock_irq(rwlock_t *lock)
 {
 	__read_unlock_irq(lock);
@@ -336,7 +336,7 @@ void __lockfunc _read_unlock_irq(rwlock_t *lock)
 EXPORT_SYMBOL(_read_unlock_irq);
 #endif
 
-#ifndef _read_unlock_bh
+#ifndef CONFIG_INLINE_READ_UNLOCK_BH
 void __lockfunc _read_unlock_bh(rwlock_t *lock)
 {
 	__read_unlock_bh(lock);
@@ -344,7 +344,7 @@ void __lockfunc _read_unlock_bh(rwlock_t *lock)
 EXPORT_SYMBOL(_read_unlock_bh);
 #endif
 
-#ifndef _write_unlock_irqrestore
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
 	__write_unlock_irqrestore(lock, flags);
@@ -352,7 +352,7 @@ void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 EXPORT_SYMBOL(_write_unlock_irqrestore);
 #endif
 
-#ifndef _write_unlock_irq
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
 void __lockfunc _write_unlock_irq(rwlock_t *lock)
 {
 	__write_unlock_irq(lock);
@@ -360,7 +360,7 @@ void __lockfunc _write_unlock_irq(rwlock_t *lock)
 EXPORT_SYMBOL(_write_unlock_irq);
 #endif
 
-#ifndef _write_unlock_bh
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
 void __lockfunc _write_unlock_bh(rwlock_t *lock)
 {
 	__write_unlock_bh(lock);
@@ -368,7 +368,7 @@ void __lockfunc _write_unlock_bh(rwlock_t *lock)
 EXPORT_SYMBOL(_write_unlock_bh);
 #endif
 
-#ifndef _spin_trylock_bh
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
 int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 {
 	return __spin_trylock_bh(lock);
-- 
cgit v1.2.3


From 572a9d7b6fc7f20f573664063324c086be310c42 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 10 Nov 2009 06:14:14 +0000
Subject: net: allow to propagate errors through ->ndo_hard_start_xmit()

Currently the ->ndo_hard_start_xmit() callbacks are only permitted to return
one of the NETDEV_TX codes. This prevents any kind of error propagation for
virtual devices, like queue congestion of the underlying device in case of
layered devices, or unreachability in case of tunnels.

This patches changes the NET_XMIT codes to avoid clashes with the NETDEV_TX
codes and changes the two callers of dev_hard_start_xmit() to expect either
errno codes, NET_XMIT codes or NETDEV_TX codes as return value.

In case of qdisc_restart(), all non NETDEV_TX codes are mapped to NETDEV_TX_OK
since no error propagation is possible when using qdiscs. In case of
dev_queue_xmit(), the error is propagated upwards.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 43 ++++++++++++++++++++++++++++++++-----------
 net/core/dev.c            | 32 ++++++++++++++++++++++++++------
 net/sched/sch_generic.c   |  9 ++++++++-
 3 files changed, 66 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 083b5989cecb..8b266390b9e2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -63,27 +63,48 @@ struct wireless_dev;
 #define HAVE_FREE_NETDEV		/* free_netdev() */
 #define HAVE_NETDEV_PRIV		/* netdev_priv() */
 
-#define NET_XMIT_SUCCESS	0
-#define NET_XMIT_DROP		1	/* skb dropped			*/
-#define NET_XMIT_CN		2	/* congestion notification	*/
-#define NET_XMIT_POLICED	3	/* skb is shot by police	*/
-#define NET_XMIT_MASK		0xFFFF	/* qdisc flags in net/sch_generic.h */
+/*
+ * Transmit return codes: transmit return codes originate from three different
+ * namespaces:
+ *
+ * - qdisc return codes
+ * - driver transmit return codes
+ * - errno values
+ *
+ * Drivers are allowed to return any one of those in their hard_start_xmit()
+ * function. Real network devices commonly used with qdiscs should only return
+ * the driver transmit return codes though - when qdiscs are used, the actual
+ * transmission happens asynchronously, so the value is not propagated to
+ * higher layers. Virtual network devices transmit synchronously, in this case
+ * the driver transmit return codes are consumed by dev_queue_xmit(), all
+ * others are propagated to higher layers.
+ */
+
+/* qdisc ->enqueue() return codes. */
+#define NET_XMIT_SUCCESS	0x00
+#define NET_XMIT_DROP		0x10	/* skb dropped			*/
+#define NET_XMIT_CN		0x20	/* congestion notification	*/
+#define NET_XMIT_POLICED	0x30	/* skb is shot by police	*/
+#define NET_XMIT_MASK		0xf0	/* qdisc flags in net/sch_generic.h */
 
 /* Backlog congestion levels */
-#define NET_RX_SUCCESS		0   /* keep 'em coming, baby */
-#define NET_RX_DROP		1  /* packet dropped */
+#define NET_RX_SUCCESS		0	/* keep 'em coming, baby */
+#define NET_RX_DROP		1	/* packet dropped */
 
 /* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It
  * indicates that the device will soon be dropping packets, or already drops
  * some packets of the same priority; prompting us to send less aggressively. */
-#define net_xmit_eval(e)	((e) == NET_XMIT_CN? 0 : (e))
+#define net_xmit_eval(e)	((e) == NET_XMIT_CN ? 0 : (e))
 #define net_xmit_errno(e)	((e) != NET_XMIT_CN ? -ENOBUFS : 0)
 
 /* Driver transmit return codes */
+#define NETDEV_TX_MASK		0xf
+
 enum netdev_tx {
-	NETDEV_TX_OK = 0,	/* driver took care of packet */
-	NETDEV_TX_BUSY,		/* driver tx path was busy*/
-	NETDEV_TX_LOCKED = -1,	/* driver tx lock was already taken */
+	__NETDEV_TX_MIN	 = INT_MIN,	/* make sure enum is signed */
+	NETDEV_TX_OK	 = 0,		/* driver took care of packet */
+	NETDEV_TX_BUSY	 = 1,		/* driver tx path was busy*/
+	NETDEV_TX_LOCKED = 2,		/* driver tx lock was already taken */
 };
 typedef enum netdev_tx netdev_tx_t;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ad8e320ceba7..548340b57296 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1757,7 +1757,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			struct netdev_queue *txq)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-	int rc;
+	int rc = NETDEV_TX_OK;
 
 	if (likely(!skb->next)) {
 		if (!list_empty(&ptype_all))
@@ -1805,6 +1805,8 @@ gso:
 		nskb->next = NULL;
 		rc = ops->ndo_start_xmit(nskb, dev);
 		if (unlikely(rc != NETDEV_TX_OK)) {
+			if (rc & ~NETDEV_TX_MASK)
+				goto out_kfree_gso_skb;
 			nskb->next = skb->next;
 			skb->next = nskb;
 			return rc;
@@ -1814,11 +1816,12 @@ gso:
 			return NETDEV_TX_BUSY;
 	} while (skb->next);
 
-	skb->destructor = DEV_GSO_CB(skb)->destructor;
-
+out_kfree_gso_skb:
+	if (likely(skb->next == NULL))
+		skb->destructor = DEV_GSO_CB(skb)->destructor;
 out_kfree_skb:
 	kfree_skb(skb);
-	return NETDEV_TX_OK;
+	return rc;
 }
 
 static u32 skb_tx_hashrnd;
@@ -1906,6 +1909,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	return rc;
 }
 
+static inline bool dev_xmit_complete(int rc)
+{
+	/* successful transmission */
+	if (rc == NETDEV_TX_OK)
+		return true;
+
+	/* error while transmitting, driver consumed skb */
+	if (rc < 0)
+		return true;
+
+	/* error while queueing to a different device, driver consumed skb */
+	if (rc & NET_XMIT_MASK)
+		return true;
+
+	return false;
+}
+
 /**
  *	dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
@@ -2003,8 +2023,8 @@ gso:
 			HARD_TX_LOCK(dev, txq, cpu);
 
 			if (!netif_tx_queue_stopped(txq)) {
-				rc = NET_XMIT_SUCCESS;
-				if (!dev_hard_start_xmit(skb, dev, txq)) {
+				rc = dev_hard_start_xmit(skb, dev, txq);
+				if (dev_xmit_complete(rc)) {
 					HARD_TX_UNLOCK(dev, txq);
 					goto out;
 				}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 4ae6aa562f2b..b13821ad2fb6 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -120,8 +120,15 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
 	if (!netif_tx_queue_stopped(txq) &&
-	    !netif_tx_queue_frozen(txq))
+	    !netif_tx_queue_frozen(txq)) {
 		ret = dev_hard_start_xmit(skb, dev, txq);
+
+		/* an error implies that the skb was consumed */
+		if (ret < 0)
+			ret = NETDEV_TX_OK;
+		/* all NET_XMIT codes map to NETDEV_TX_OK */
+		ret &= ~NET_XMIT_MASK;
+	}
 	HARD_TX_UNLOCK(dev, txq);
 
 	spin_lock(root_lock);
-- 
cgit v1.2.3


From d9b263528e01bfbaf716b51f38606b3dfe5ac1e9 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Fri, 13 Nov 2009 14:57:00 -0500
Subject: x86, setup: Store the boot cursor state

Add a field to store the boot cursor state and implement this for VGA on
x86. This can then be used to set the default policy for the boot console.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
LKML-Reference: <1258142222-16092-1-git-send-email-mjg@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/video.c       | 6 ++++++
 include/linux/screen_info.h | 5 ++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index d42da3802499..f767164cd5df 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -27,6 +27,12 @@ static void store_cursor_position(void)
 
 	boot_params.screen_info.orig_x = oreg.dl;
 	boot_params.screen_info.orig_y = oreg.dh;
+
+	if (oreg.ch & 0x20)
+		boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
+
+	if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
+		boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
 }
 
 static void store_video_mode(void)
diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h
index 1ee2c05142f6..899fbb487c94 100644
--- a/include/linux/screen_info.h
+++ b/include/linux/screen_info.h
@@ -14,7 +14,8 @@ struct screen_info {
 	__u16 orig_video_page;	/* 0x04 */
 	__u8  orig_video_mode;	/* 0x06 */
 	__u8  orig_video_cols;	/* 0x07 */
-	__u16 unused2;		/* 0x08 */
+	__u8  flags;		/* 0x08 */
+	__u8  unused2;		/* 0x09 */
 	__u16 orig_video_ega_bx;/* 0x0a */
 	__u16 unused3;		/* 0x0c */
 	__u8  orig_video_lines;	/* 0x0e */
@@ -65,6 +66,8 @@ struct screen_info {
 
 #define VIDEO_TYPE_EFI		0x70	/* EFI graphic mode		*/
 
+#define VIDEO_FLAGS_NOCURSOR	(1 << 0) /* The video mode has no cursor set */
+
 #ifdef __KERNEL__
 extern struct screen_info screen_info;
 
-- 
cgit v1.2.3


From 90a5e16992fa6105f7ebf3f29f5cf5feb1bbf7dc Mon Sep 17 00:00:00 2001
From: Rui Paulo <rpaulo@gmail.com>
Date: Wed, 11 Nov 2009 00:01:31 +0000
Subject: mac80211: implement RANN processing and forwarding

Process the RANN (Root Annoucement) Frame and try to find the HWMP
root station by sending a PREQ.

Signed-off-by: Rui Paulo <rpaulo@gmail.com>
Signed-off-by: Javier Cardona <javier@cozybit.com>
Reviewed-by: Andrey Yurovsky <andrey@cozybit.com>
Tested-by: Brian Cavagnolo <brian@cozybit.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h  | 15 +++++++++
 net/mac80211/ieee80211_i.h |  1 +
 net/mac80211/mesh_hwmp.c   | 79 +++++++++++++++++++++++++++++++++++++++++-----
 net/mac80211/util.c        |  4 +++
 4 files changed, 91 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 50c684db33c7..49b1abd2fe97 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -554,6 +554,20 @@ struct ieee80211_tim_ie {
 	u8 virtual_map[1];
 } __attribute__ ((packed));
 
+/**
+ * struct ieee80211_rann_ie
+ *
+ * This structure refers to "Root Announcement information element"
+ */
+struct ieee80211_rann_ie {
+	u8 rann_flags;
+	u8 rann_hopcount;
+	u8 rann_ttl;
+	u8 rann_addr[6];
+	u32 rann_seq;
+	u32 rann_metric;
+} __attribute__ ((packed));
+
 #define WLAN_SA_QUERY_TR_ID_LEN 2
 
 struct ieee80211_mgmt {
@@ -1070,6 +1084,7 @@ enum ieee80211_eid {
 	WLAN_EID_PREQ = 68,
 	WLAN_EID_PREP = 69,
 	WLAN_EID_PERR = 70,
+	WLAN_EID_RANN = 49,	/* compatible with FreeBSD */
 	/* 802.11h */
 	WLAN_EID_PWR_CONSTRAINT = 32,
 	WLAN_EID_PWR_CAPABILITY = 33,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 19b0c128d940..2a7d3d4067e1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -804,6 +804,7 @@ struct ieee802_11_elems {
 	u8 *preq;
 	u8 *prep;
 	u8 *perr;
+	struct ieee80211_rann_ie *rann;
 	u8 *ch_switch_elem;
 	u8 *country_elem;
 	u8 *pwr_constr_elem;
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index db1a33098a88..7b9dd87cf9f2 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -28,6 +28,8 @@
 /* Reply and forward */
 #define MP_F_RF	0x2
 
+static void mesh_queue_preq(struct mesh_path *, u8);
+
 static inline u32 u32_field_get(u8 *preq_elem, int offset, bool ae)
 {
 	if (ae)
@@ -81,7 +83,8 @@ static inline u32 u32_field_get(u8 *preq_elem, int offset, bool ae)
 enum mpath_frame_type {
 	MPATH_PREQ = 0,
 	MPATH_PREP,
-	MPATH_PERR
+	MPATH_PERR,
+	MPATH_RANN
 };
 
 static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
@@ -109,7 +112,8 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 
 	memcpy(mgmt->da, da, ETH_ALEN);
 	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	/* BSSID is left zeroed, wildcard value */
+	/* BSSID == SA */
+	memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
 	mgmt->u.action.category = MESH_PATH_SEL_CATEGORY;
 	mgmt->u.action.u.mesh_action.action_code = MESH_PATH_SEL_ACTION;
 
@@ -126,6 +130,12 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 		pos = skb_put(skb, 2 + ie_len);
 		*pos++ = WLAN_EID_PREP;
 		break;
+	case MPATH_RANN:
+		mhwmp_dbg("sending RANN from %pM\n", orig_addr);
+		ie_len = sizeof(struct ieee80211_rann_ie);
+		pos = skb_put(skb, 2 + ie_len);
+		*pos++ = WLAN_EID_RANN;
+		break;
 	default:
 		kfree_skb(skb);
 		return -ENOTSUPP;
@@ -143,8 +153,10 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 	pos += ETH_ALEN;
 	memcpy(pos, &orig_dsn, 4);
 	pos += 4;
-	memcpy(pos, &lifetime, 4);
-	pos += 4;
+	if (action != MPATH_RANN) {
+		memcpy(pos, &lifetime, 4);
+		pos += 4;
+	}
 	memcpy(pos, &metric, 4);
 	pos += 4;
 	if (action == MPATH_PREQ) {
@@ -152,9 +164,11 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 		*pos++ = 1;
 		*pos++ = dst_flags;
 	}
-	memcpy(pos, dst, ETH_ALEN);
-	pos += ETH_ALEN;
-	memcpy(pos, &dst_dsn, 4);
+	if (action != MPATH_RANN) {
+		memcpy(pos, dst, ETH_ALEN);
+		pos += ETH_ALEN;
+		memcpy(pos, &dst_dsn, 4);
+	}
 
 	ieee80211_tx_skb(sdata, skb, 1);
 	return 0;
@@ -610,6 +624,54 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
 	rcu_read_unlock();
 }
 
+static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_mgmt *mgmt,
+				struct ieee80211_rann_ie *rann)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct mesh_path *mpath;
+	u8 *ta;
+	u8 ttl, flags, hopcount;
+	u8 *orig_addr;
+	u32 orig_dsn, metric;
+
+	ta = mgmt->sa;
+	ttl = rann->rann_ttl;
+	if (ttl <= 1) {
+		ifmsh->mshstats.dropped_frames_ttl++;
+		return;
+	}
+	ttl--;
+	flags = rann->rann_flags;
+	orig_addr = rann->rann_addr;
+	orig_dsn = rann->rann_seq;
+	hopcount = rann->rann_hopcount;
+	metric = rann->rann_metric;
+	mhwmp_dbg("received RANN from %pM\n", orig_addr);
+
+	rcu_read_lock();
+	mpath = mesh_path_lookup(orig_addr, sdata);
+	if (!mpath) {
+		mesh_path_add(orig_addr, sdata);
+		mpath = mesh_path_lookup(orig_addr, sdata);
+		if (!mpath) {
+			rcu_read_unlock();
+			sdata->u.mesh.mshstats.dropped_frames_no_route++;
+			return;
+		}
+		mesh_queue_preq(mpath,
+				PREQ_Q_F_START | PREQ_Q_F_REFRESH);
+	}
+	if (mpath->dsn < orig_dsn) {
+		mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr,
+				       cpu_to_le32(orig_dsn),
+				       0, NULL, 0, sdata->dev->broadcast,
+				       hopcount, ttl, 0, cpu_to_le32(metric),
+				       0, sdata);
+		mpath->dsn = orig_dsn;
+	}
+	rcu_read_unlock();
+}
 
 
 void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
@@ -654,7 +716,8 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
 			return;
 		hwmp_perr_frame_process(sdata, mgmt, elems.perr);
 	}
-
+	if (elems.rann)
+		hwmp_rann_frame_process(sdata, mgmt, elems.rann);
 }
 
 /**
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index aedbaaa067e6..da86e1592f8c 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -685,6 +685,10 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len,
 			elems->perr = pos;
 			elems->perr_len = elen;
 			break;
+		case WLAN_EID_RANN:
+			if (elen >= sizeof(struct ieee80211_rann_ie))
+				elems->rann = (void *)pos;
+			break;
 		case WLAN_EID_CHANNEL_SWITCH:
 			elems->ch_switch_elem = pos;
 			elems->ch_switch_elem_len = elen;
-- 
cgit v1.2.3


From d19b3bf6384e66ac6e11a61ee31ed2cfe149f4d8 Mon Sep 17 00:00:00 2001
From: Rui Paulo <rpaulo@gmail.com>
Date: Mon, 9 Nov 2009 23:46:55 +0000
Subject: mac80211: replace "destination" with "target" to follow the spec

Resulting object files have the same MD5 as before.

Signed-off-by: Rui Paulo <rpaulo@gmail.com>
Signed-off-by: Javier Cardona <javier@cozybit.com>
Reviewed-by: Andrey Yurovsky <andrey@cozybit.com>
Tested-by: Brian Cavagnolo <brian@cozybit.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h     |   8 +-
 include/net/cfg80211.h      |   8 +-
 net/mac80211/cfg.c          |   8 +-
 net/mac80211/ieee80211_i.h  |  10 +-
 net/mac80211/mesh.c         |   2 +-
 net/mac80211/mesh.h         |  10 +-
 net/mac80211/mesh_hwmp.c    | 221 ++++++++++++++++++++++----------------------
 net/mac80211/mesh_pathtbl.c |  12 +--
 net/wireless/nl80211.c      |   6 +-
 9 files changed, 144 insertions(+), 141 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 203adef972cf..7a0bd6ea6f63 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -899,14 +899,14 @@ enum nl80211_sta_info {
  *
  * @NL80211_MPATH_FLAG_ACTIVE: the mesh path is active
  * @NL80211_MPATH_FLAG_RESOLVING: the mesh path discovery process is running
- * @NL80211_MPATH_FLAG_DSN_VALID: the mesh path contains a valid DSN
+ * @NL80211_MPATH_FLAG_SN_VALID: the mesh path contains a valid SN
  * @NL80211_MPATH_FLAG_FIXED: the mesh path has been manually set
  * @NL80211_MPATH_FLAG_RESOLVED: the mesh path discovery process succeeded
  */
 enum nl80211_mpath_flags {
 	NL80211_MPATH_FLAG_ACTIVE =	1<<0,
 	NL80211_MPATH_FLAG_RESOLVING =	1<<1,
-	NL80211_MPATH_FLAG_DSN_VALID =	1<<2,
+	NL80211_MPATH_FLAG_SN_VALID =	1<<2,
 	NL80211_MPATH_FLAG_FIXED =	1<<3,
 	NL80211_MPATH_FLAG_RESOLVED =	1<<4,
 };
@@ -919,7 +919,7 @@ enum nl80211_mpath_flags {
  *
  * @__NL80211_MPATH_INFO_INVALID: attribute number 0 is reserved
  * @NL80211_ATTR_MPATH_FRAME_QLEN: number of queued frames for this destination
- * @NL80211_ATTR_MPATH_DSN: destination sequence number
+ * @NL80211_ATTR_MPATH_SN: destination sequence number
  * @NL80211_ATTR_MPATH_METRIC: metric (cost) of this mesh path
  * @NL80211_ATTR_MPATH_EXPTIME: expiration time for the path, in msec from now
  * @NL80211_ATTR_MPATH_FLAGS: mesh path flags, enumerated in
@@ -930,7 +930,7 @@ enum nl80211_mpath_flags {
 enum nl80211_mpath_info {
 	__NL80211_MPATH_INFO_INVALID,
 	NL80211_MPATH_INFO_FRAME_QLEN,
-	NL80211_MPATH_INFO_DSN,
+	NL80211_MPATH_INFO_SN,
 	NL80211_MPATH_INFO_METRIC,
 	NL80211_MPATH_INFO_EXPTIME,
 	NL80211_MPATH_INFO_FLAGS,
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 1ee41e4a92e2..7d0ae9c18286 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -420,7 +420,7 @@ enum monitor_flags {
  * in during get_station() or dump_station().
  *
  * MPATH_INFO_FRAME_QLEN: @frame_qlen filled
- * MPATH_INFO_DSN: @dsn filled
+ * MPATH_INFO_SN: @sn filled
  * MPATH_INFO_METRIC: @metric filled
  * MPATH_INFO_EXPTIME: @exptime filled
  * MPATH_INFO_DISCOVERY_TIMEOUT: @discovery_timeout filled
@@ -429,7 +429,7 @@ enum monitor_flags {
  */
 enum mpath_info_flags {
 	MPATH_INFO_FRAME_QLEN		= BIT(0),
-	MPATH_INFO_DSN			= BIT(1),
+	MPATH_INFO_SN			= BIT(1),
 	MPATH_INFO_METRIC		= BIT(2),
 	MPATH_INFO_EXPTIME		= BIT(3),
 	MPATH_INFO_DISCOVERY_TIMEOUT	= BIT(4),
@@ -444,7 +444,7 @@ enum mpath_info_flags {
  *
  * @filled: bitfield of flags from &enum mpath_info_flags
  * @frame_qlen: number of queued frames for this destination
- * @dsn: destination sequence number
+ * @sn: target sequence number
  * @metric: metric (cost) of this mesh path
  * @exptime: expiration time for the mesh path from now, in msecs
  * @flags: mesh path flags
@@ -458,7 +458,7 @@ enum mpath_info_flags {
 struct mpath_info {
 	u32 filled;
 	u32 frame_qlen;
-	u32 dsn;
+	u32 sn;
 	u32 metric;
 	u32 exptime;
 	u32 discovery_timeout;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 576b86f81d1b..81053587e72b 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -935,7 +935,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
 	pinfo->generation = mesh_paths_generation;
 
 	pinfo->filled = MPATH_INFO_FRAME_QLEN |
-			MPATH_INFO_DSN |
+			MPATH_INFO_SN |
 			MPATH_INFO_METRIC |
 			MPATH_INFO_EXPTIME |
 			MPATH_INFO_DISCOVERY_TIMEOUT |
@@ -943,7 +943,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
 			MPATH_INFO_FLAGS;
 
 	pinfo->frame_qlen = mpath->frame_queue.qlen;
-	pinfo->dsn = mpath->dsn;
+	pinfo->sn = mpath->sn;
 	pinfo->metric = mpath->metric;
 	if (time_before(jiffies, mpath->exp_time))
 		pinfo->exptime = jiffies_to_msecs(mpath->exp_time - jiffies);
@@ -955,8 +955,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
 		pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE;
 	if (mpath->flags & MESH_PATH_RESOLVING)
 		pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING;
-	if (mpath->flags & MESH_PATH_DSN_VALID)
-		pinfo->flags |= NL80211_MPATH_FLAG_DSN_VALID;
+	if (mpath->flags & MESH_PATH_SN_VALID)
+		pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID;
 	if (mpath->flags & MESH_PATH_FIXED)
 		pinfo->flags |= NL80211_MPATH_FLAG_FIXED;
 	if (mpath->flags & MESH_PATH_RESOLVING)
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 2a7d3d4067e1..fb54ddb9b27d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -373,14 +373,14 @@ struct ieee80211_if_mesh {
 	u8 mesh_sp_id;
 	/* Authentication Protocol Identifier */
 	u8 mesh_auth_id;
-	/* Local mesh Destination Sequence Number */
-	u32 dsn;
+	/* Local mesh Sequence Number */
+	u32 sn;
 	/* Last used PREQ ID */
 	u32 preq_id;
 	atomic_t mpaths;
-	/* Timestamp of last DSN update */
-	unsigned long last_dsn_update;
-	/* Timestamp of last DSN sent */
+	/* Timestamp of last SN update */
+	unsigned long last_sn_update;
+	/* Timestamp of last SN sent */
 	unsigned long last_preq;
 	struct mesh_rmc *rmc;
 	spinlock_t mesh_preq_queue_lock;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 222a31338390..0f3e1147ec4f 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -672,7 +672,7 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
 		MESH_MIN_DISCOVERY_TIMEOUT;
 	ifmsh->accepting_plinks = true;
 	ifmsh->preq_id = 0;
-	ifmsh->dsn = 0;
+	ifmsh->sn = 0;
 	atomic_set(&ifmsh->mpaths, 0);
 	mesh_rmc_init(sdata);
 	ifmsh->last_preq = jiffies;
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 1d534c702406..ee687add3927 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -26,7 +26,7 @@
  *
  * @MESH_PATH_ACTIVE: the mesh path can be used for forwarding
  * @MESH_PATH_RESOLVING: the discovery process is running for this mesh path
- * @MESH_PATH_DSN_VALID: the mesh path contains a valid destination sequence
+ * @MESH_PATH_SN_VALID: the mesh path contains a valid destination sequence
  * 	number
  * @MESH_PATH_FIXED: the mesh path has been manually set and should not be
  * 	modified
@@ -38,7 +38,7 @@
 enum mesh_path_flags {
 	MESH_PATH_ACTIVE =	BIT(0),
 	MESH_PATH_RESOLVING =	BIT(1),
-	MESH_PATH_DSN_VALID =	BIT(2),
+	MESH_PATH_SN_VALID =	BIT(2),
 	MESH_PATH_FIXED	=	BIT(3),
 	MESH_PATH_RESOLVED =	BIT(4),
 };
@@ -70,7 +70,7 @@ enum mesh_deferred_task_flags {
  * @timer: mesh path discovery timer
  * @frame_queue: pending queue for frames sent to this destination while the
  * 	path is unresolved
- * @dsn: destination sequence number of the destination
+ * @sn: target sequence number
  * @metric: current metric to this destination
  * @hop_count: hops to destination
  * @exp_time: in jiffies, when the path will expire or when it expired
@@ -94,7 +94,7 @@ struct mesh_path {
 	struct timer_list timer;
 	struct sk_buff_head frame_queue;
 	struct rcu_head rcu;
-	u32 dsn;
+	u32 sn;
 	u32 metric;
 	u8 hop_count;
 	unsigned long exp_time;
@@ -280,7 +280,7 @@ void mesh_mpp_table_grow(void);
 u32 mesh_table_hash(u8 *addr, struct ieee80211_sub_if_data *sdata,
 		struct mesh_table *tbl);
 /* Mesh paths */
-int mesh_path_error_tx(u8 ttl, u8 *dest, __le32 dest_dsn, __le16 dest_rcode,
+int mesh_path_error_tx(u8 ttl, u8 *target, __le32 target_sn, __le16 target_rcode,
 		u8 *ra, struct ieee80211_sub_if_data *sdata);
 void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta);
 void mesh_path_flush_pending(struct mesh_path *mpath);
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 7d36f3a741a5..f03a27dfd616 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -56,33 +56,33 @@ static inline u32 u16_field_get(u8 *preq_elem, int offset, bool ae)
 #define PREQ_IE_TTL(x)		(*(x + 2))
 #define PREQ_IE_PREQ_ID(x)	u32_field_get(x, 3, 0)
 #define PREQ_IE_ORIG_ADDR(x)	(x + 7)
-#define PREQ_IE_ORIG_DSN(x)	u32_field_get(x, 13, 0);
+#define PREQ_IE_ORIG_SN(x)	u32_field_get(x, 13, 0);
 #define PREQ_IE_LIFETIME(x)	u32_field_get(x, 17, AE_F_SET(x));
 #define PREQ_IE_METRIC(x) 	u32_field_get(x, 21, AE_F_SET(x));
-#define PREQ_IE_DST_F(x)	(*(AE_F_SET(x) ? x + 32 : x + 26))
-#define PREQ_IE_DST_ADDR(x) 	(AE_F_SET(x) ? x + 33 : x + 27)
-#define PREQ_IE_DST_DSN(x) 	u32_field_get(x, 33, AE_F_SET(x));
+#define PREQ_IE_TARGET_F(x)	(*(AE_F_SET(x) ? x + 32 : x + 26))
+#define PREQ_IE_TARGET_ADDR(x) 	(AE_F_SET(x) ? x + 33 : x + 27)
+#define PREQ_IE_TARGET_SN(x) 	u32_field_get(x, 33, AE_F_SET(x));
 
 
 #define PREP_IE_FLAGS(x)	PREQ_IE_FLAGS(x)
 #define PREP_IE_HOPCOUNT(x)	PREQ_IE_HOPCOUNT(x)
 #define PREP_IE_TTL(x)		PREQ_IE_TTL(x)
 #define PREP_IE_ORIG_ADDR(x)	(x + 3)
-#define PREP_IE_ORIG_DSN(x)	u32_field_get(x, 9, 0);
+#define PREP_IE_ORIG_SN(x)	u32_field_get(x, 9, 0);
 #define PREP_IE_LIFETIME(x)	u32_field_get(x, 13, AE_F_SET(x));
 #define PREP_IE_METRIC(x)	u32_field_get(x, 17, AE_F_SET(x));
-#define PREP_IE_DST_ADDR(x)	(AE_F_SET(x) ? x + 27 : x + 21)
-#define PREP_IE_DST_DSN(x)	u32_field_get(x, 27, AE_F_SET(x));
+#define PREP_IE_TARGET_ADDR(x)	(AE_F_SET(x) ? x + 27 : x + 21)
+#define PREP_IE_TARGET_SN(x)	u32_field_get(x, 27, AE_F_SET(x));
 
 #define PERR_IE_TTL(x)		(*(x))
-#define PERR_IE_DST_FLAGS(x)	(*(x + 2))
-#define PERR_IE_DST_ADDR(x)	(x + 3)
-#define PERR_IE_DST_DSN(x)	u32_field_get(x, 9, 0);
-#define PERR_IE_DST_RCODE(x)	u16_field_get(x, 13, 0);
+#define PERR_IE_TARGET_FLAGS(x)	(*(x + 2))
+#define PERR_IE_TARGET_ADDR(x)	(x + 3)
+#define PERR_IE_TARGET_SN(x)	u32_field_get(x, 9, 0);
+#define PERR_IE_TARGET_RCODE(x)	u16_field_get(x, 13, 0);
 
 #define MSEC_TO_TU(x) (x*1000/1024)
-#define DSN_GT(x, y) ((long) (y) - (long) (x) < 0)
-#define DSN_LT(x, y) ((long) (x) - (long) (y) < 0)
+#define SN_GT(x, y) ((long) (y) - (long) (x) < 0)
+#define SN_LT(x, y) ((long) (x) - (long) (y) < 0)
 
 #define net_traversal_jiffies(s) \
 	msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime)
@@ -102,9 +102,10 @@ enum mpath_frame_type {
 };
 
 static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
-		u8 *orig_addr, __le32 orig_dsn, u8 dst_flags, u8 *dst,
-		__le32 dst_dsn, u8 *da, u8 hop_count, u8 ttl, __le32 lifetime,
-		__le32 metric, __le32 preq_id, struct ieee80211_sub_if_data *sdata)
+		u8 *orig_addr, __le32 orig_sn, u8 target_flags, u8 *target,
+		__le32 target_sn, u8 *da, u8 hop_count, u8 ttl,__le32 lifetime,
+		__le32 metric, __le32 preq_id,
+		struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
@@ -133,13 +134,13 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 
 	switch (action) {
 	case MPATH_PREQ:
-		mhwmp_dbg("sending PREQ to %pM\n", dst);
+		mhwmp_dbg("sending PREQ to %pM\n", target);
 		ie_len = 37;
 		pos = skb_put(skb, 2 + ie_len);
 		*pos++ = WLAN_EID_PREQ;
 		break;
 	case MPATH_PREP:
-		mhwmp_dbg("sending PREP to %pM\n", dst);
+		mhwmp_dbg("sending PREP to %pM\n", target);
 		ie_len = 31;
 		pos = skb_put(skb, 2 + ie_len);
 		*pos++ = WLAN_EID_PREP;
@@ -165,7 +166,7 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 	}
 	memcpy(pos, orig_addr, ETH_ALEN);
 	pos += ETH_ALEN;
-	memcpy(pos, &orig_dsn, 4);
+	memcpy(pos, &orig_sn, 4);
 	pos += 4;
 	if (action != MPATH_RANN) {
 		memcpy(pos, &lifetime, 4);
@@ -176,12 +177,12 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 	if (action == MPATH_PREQ) {
 		/* destination count */
 		*pos++ = 1;
-		*pos++ = dst_flags;
+		*pos++ = target_flags;
 	}
 	if (action != MPATH_RANN) {
-		memcpy(pos, dst, ETH_ALEN);
+		memcpy(pos, target, ETH_ALEN);
 		pos += ETH_ALEN;
-		memcpy(pos, &dst_dsn, 4);
+		memcpy(pos, &target_sn, 4);
 	}
 
 	ieee80211_tx_skb(sdata, skb, 1);
@@ -191,12 +192,14 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
 /**
  * mesh_send_path error - Sends a PERR mesh management frame
  *
- * @dst: broken destination
- * @dst_dsn: dsn of the broken destination
+ * @target: broken destination
+ * @target_sn: SN of the broken destination
+ * @target_rcode: reason code for this PERR
  * @ra: node this frame is addressed to
  */
-int mesh_path_error_tx(u8 ttl, u8 *dst, __le32 dst_dsn, __le16 dst_rcode,
-		u8 *ra, struct ieee80211_sub_if_data *sdata)
+int mesh_path_error_tx(u8 ttl, u8 *target, __le32 target_sn,
+		__le16 target_rcode, u8 *ra,
+		struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
@@ -234,16 +237,16 @@ int mesh_path_error_tx(u8 ttl, u8 *dst, __le32 dst_dsn, __le16 dst_rcode,
 	 * bit 2 is set if we have a reason code
 	 */
 	*pos = 0;
-	if (!dst_dsn)
+	if (!target_sn)
 		*pos |= MP_F_USN;
-	if (dst_rcode)
+	if (target_rcode)
 		*pos |= MP_F_RCODE;
 	pos++;
-	memcpy(pos, dst, ETH_ALEN);
+	memcpy(pos, target, ETH_ALEN);
 	pos += ETH_ALEN;
-	memcpy(pos, &dst_dsn, 4);
+	memcpy(pos, &target_sn, 4);
 	pos += 4;
-	memcpy(pos, &dst_rcode, 2);
+	memcpy(pos, &target_rcode, 2);
 
 	ieee80211_tx_skb(sdata, skb, 1);
 	return 0;
@@ -324,7 +327,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 	struct sta_info *sta;
 	bool fresh_info;
 	u8 *orig_addr, *ta;
-	u32 orig_dsn, orig_metric;
+	u32 orig_sn, orig_metric;
 	unsigned long orig_lifetime, exp_time;
 	u32 last_hop_metric, new_metric;
 	bool process = true;
@@ -343,7 +346,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 	switch (action) {
 	case MPATH_PREQ:
 		orig_addr = PREQ_IE_ORIG_ADDR(hwmp_ie);
-		orig_dsn = PREQ_IE_ORIG_DSN(hwmp_ie);
+		orig_sn = PREQ_IE_ORIG_SN(hwmp_ie);
 		orig_lifetime = PREQ_IE_LIFETIME(hwmp_ie);
 		orig_metric = PREQ_IE_METRIC(hwmp_ie);
 		break;
@@ -356,7 +359,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 		 * information from both PREQ and PREP frames.
 		 */
 		orig_addr = PREP_IE_ORIG_ADDR(hwmp_ie);
-		orig_dsn = PREP_IE_ORIG_DSN(hwmp_ie);
+		orig_sn = PREP_IE_ORIG_SN(hwmp_ie);
 		orig_lifetime = PREP_IE_LIFETIME(hwmp_ie);
 		orig_metric = PREP_IE_METRIC(hwmp_ie);
 		break;
@@ -382,9 +385,9 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 			if (mpath->flags & MESH_PATH_FIXED)
 				fresh_info = false;
 			else if ((mpath->flags & MESH_PATH_ACTIVE) &&
-			    (mpath->flags & MESH_PATH_DSN_VALID)) {
-				if (DSN_GT(mpath->dsn, orig_dsn) ||
-				    (mpath->dsn == orig_dsn &&
+			    (mpath->flags & MESH_PATH_SN_VALID)) {
+				if (SN_GT(mpath->sn, orig_sn) ||
+				    (mpath->sn == orig_sn &&
 				     action == MPATH_PREQ &&
 				     new_metric > mpath->metric)) {
 					process = false;
@@ -403,9 +406,9 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 
 		if (fresh_info) {
 			mesh_path_assign_nexthop(mpath, sta);
-			mpath->flags |= MESH_PATH_DSN_VALID;
+			mpath->flags |= MESH_PATH_SN_VALID;
 			mpath->metric = new_metric;
-			mpath->dsn = orig_dsn;
+			mpath->sn = orig_sn;
 			mpath->exp_time = time_after(mpath->exp_time, exp_time)
 					  ?  mpath->exp_time : exp_time;
 			mesh_path_activate(mpath);
@@ -444,7 +447,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
 
 		if (fresh_info) {
 			mesh_path_assign_nexthop(mpath, sta);
-			mpath->flags &= ~MESH_PATH_DSN_VALID;
+			mpath->flags &= ~MESH_PATH_SN_VALID;
 			mpath->metric = last_hop_metric;
 			mpath->exp_time = time_after(mpath->exp_time, exp_time)
 					  ?  mpath->exp_time : exp_time;
@@ -466,47 +469,47 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 	struct mesh_path *mpath;
-	u8 *dst_addr, *orig_addr;
-	u8 dst_flags, ttl;
-	u32 orig_dsn, dst_dsn, lifetime;
+	u8 *target_addr, *orig_addr;
+	u8 target_flags, ttl;
+	u32 orig_sn, target_sn, lifetime;
 	bool reply = false;
 	bool forward = true;
 
-	/* Update destination DSN, if present */
-	dst_addr = PREQ_IE_DST_ADDR(preq_elem);
+	/* Update target SN, if present */
+	target_addr = PREQ_IE_TARGET_ADDR(preq_elem);
 	orig_addr = PREQ_IE_ORIG_ADDR(preq_elem);
-	dst_dsn = PREQ_IE_DST_DSN(preq_elem);
-	orig_dsn = PREQ_IE_ORIG_DSN(preq_elem);
-	dst_flags = PREQ_IE_DST_F(preq_elem);
+	target_sn = PREQ_IE_TARGET_SN(preq_elem);
+	orig_sn = PREQ_IE_ORIG_SN(preq_elem);
+	target_flags = PREQ_IE_TARGET_F(preq_elem);
 
 	mhwmp_dbg("received PREQ from %pM\n", orig_addr);
 
-	if (memcmp(dst_addr, sdata->dev->dev_addr, ETH_ALEN) == 0) {
+	if (memcmp(target_addr, sdata->dev->dev_addr, ETH_ALEN) == 0) {
 		mhwmp_dbg("PREQ is for us\n");
 		forward = false;
 		reply = true;
 		metric = 0;
-		if (time_after(jiffies, ifmsh->last_dsn_update +
+		if (time_after(jiffies, ifmsh->last_sn_update +
 					net_traversal_jiffies(sdata)) ||
-		    time_before(jiffies, ifmsh->last_dsn_update)) {
-			dst_dsn = ++ifmsh->dsn;
-			ifmsh->last_dsn_update = jiffies;
+		    time_before(jiffies, ifmsh->last_sn_update)) {
+			target_sn = ++ifmsh->sn;
+			ifmsh->last_sn_update = jiffies;
 		}
 	} else {
 		rcu_read_lock();
-		mpath = mesh_path_lookup(dst_addr, sdata);
+		mpath = mesh_path_lookup(target_addr, sdata);
 		if (mpath) {
-			if ((!(mpath->flags & MESH_PATH_DSN_VALID)) ||
-					DSN_LT(mpath->dsn, dst_dsn)) {
-				mpath->dsn = dst_dsn;
-				mpath->flags |= MESH_PATH_DSN_VALID;
-			} else if ((!(dst_flags & MP_F_DO)) &&
+			if ((!(mpath->flags & MESH_PATH_SN_VALID)) ||
+					SN_LT(mpath->sn, target_sn)) {
+				mpath->sn = target_sn;
+				mpath->flags |= MESH_PATH_SN_VALID;
+			} else if ((!(target_flags & MP_F_DO)) &&
 					(mpath->flags & MESH_PATH_ACTIVE)) {
 				reply = true;
 				metric = mpath->metric;
-				dst_dsn = mpath->dsn;
-				if (dst_flags & MP_F_RF)
-					dst_flags |= MP_F_DO;
+				target_sn = mpath->sn;
+				if (target_flags & MP_F_RF)
+					target_flags |= MP_F_DO;
 				else
 					forward = false;
 			}
@@ -519,9 +522,9 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 		ttl = ifmsh->mshcfg.dot11MeshTTL;
 		if (ttl != 0) {
 			mhwmp_dbg("replying to the PREQ\n");
-			mesh_path_sel_frame_tx(MPATH_PREP, 0, dst_addr,
-				cpu_to_le32(dst_dsn), 0, orig_addr,
-				cpu_to_le32(orig_dsn), mgmt->sa, 0, ttl,
+			mesh_path_sel_frame_tx(MPATH_PREP, 0, target_addr,
+				cpu_to_le32(target_sn), 0, orig_addr,
+				cpu_to_le32(orig_sn), mgmt->sa, 0, ttl,
 				cpu_to_le32(lifetime), cpu_to_le32(metric),
 				0, sdata);
 		} else
@@ -544,8 +547,8 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
 		preq_id = PREQ_IE_PREQ_ID(preq_elem);
 		hopcount = PREQ_IE_HOPCOUNT(preq_elem) + 1;
 		mesh_path_sel_frame_tx(MPATH_PREQ, flags, orig_addr,
-				cpu_to_le32(orig_dsn), dst_flags, dst_addr,
-				cpu_to_le32(dst_dsn), sdata->dev->broadcast,
+				cpu_to_le32(orig_sn), target_flags, target_addr,
+				cpu_to_le32(target_sn), sdata->dev->broadcast,
 				hopcount, ttl, cpu_to_le32(lifetime),
 				cpu_to_le32(metric), cpu_to_le32(preq_id),
 				sdata);
@@ -560,10 +563,10 @@ static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
 				    u8 *prep_elem, u32 metric)
 {
 	struct mesh_path *mpath;
-	u8 *dst_addr, *orig_addr;
+	u8 *target_addr, *orig_addr;
 	u8 ttl, hopcount, flags;
 	u8 next_hop[ETH_ALEN];
-	u32 dst_dsn, orig_dsn, lifetime;
+	u32 target_sn, orig_sn, lifetime;
 
 	mhwmp_dbg("received PREP from %pM\n", PREP_IE_ORIG_ADDR(prep_elem));
 
@@ -573,8 +576,8 @@ static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
 	 * which corresponds with the originator of the PREQ which this PREP
 	 * replies
 	 */
-	dst_addr = PREP_IE_DST_ADDR(prep_elem);
-	if (memcmp(dst_addr, sdata->dev->dev_addr, ETH_ALEN) == 0)
+	target_addr = PREP_IE_TARGET_ADDR(prep_elem);
+	if (memcmp(target_addr, sdata->dev->dev_addr, ETH_ALEN) == 0)
 		/* destination, no forwarding required */
 		return;
 
@@ -585,7 +588,7 @@ static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
 	}
 
 	rcu_read_lock();
-	mpath = mesh_path_lookup(dst_addr, sdata);
+	mpath = mesh_path_lookup(target_addr, sdata);
 	if (mpath)
 		spin_lock_bh(&mpath->state_lock);
 	else
@@ -601,13 +604,13 @@ static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
 	lifetime = PREP_IE_LIFETIME(prep_elem);
 	hopcount = PREP_IE_HOPCOUNT(prep_elem) + 1;
 	orig_addr = PREP_IE_ORIG_ADDR(prep_elem);
-	dst_dsn = PREP_IE_DST_DSN(prep_elem);
-	orig_dsn = PREP_IE_ORIG_DSN(prep_elem);
+	target_sn = PREP_IE_TARGET_SN(prep_elem);
+	orig_sn = PREP_IE_ORIG_SN(prep_elem);
 
 	mesh_path_sel_frame_tx(MPATH_PREP, flags, orig_addr,
-		cpu_to_le32(orig_dsn), 0, dst_addr,
-		cpu_to_le32(dst_dsn), mpath->next_hop->sta.addr, hopcount, ttl,
-		cpu_to_le32(lifetime), cpu_to_le32(metric),
+		cpu_to_le32(orig_sn), 0, target_addr,
+		cpu_to_le32(target_sn), mpath->next_hop->sta.addr, hopcount,
+		ttl, cpu_to_le32(lifetime), cpu_to_le32(metric),
 		0, sdata);
 	rcu_read_unlock();
 
@@ -627,10 +630,10 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 	struct mesh_path *mpath;
 	u8 ttl;
-	u8 *ta, *dst_addr;
-	u8 dst_flags;
-	u32 dst_dsn;
-	u16 dst_rcode;
+	u8 *ta, *target_addr;
+	u8 target_flags;
+	u32 target_sn;
+	u16 target_rcode;
 
 	ta = mgmt->sa;
 	ttl = PERR_IE_TTL(perr_elem);
@@ -639,24 +642,24 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 	ttl--;
-	dst_flags = PERR_IE_DST_FLAGS(perr_elem);
-	dst_addr = PERR_IE_DST_ADDR(perr_elem);
-	dst_dsn = PERR_IE_DST_DSN(perr_elem);
-	dst_rcode = PERR_IE_DST_RCODE(perr_elem);
+	target_flags = PERR_IE_TARGET_FLAGS(perr_elem);
+	target_addr = PERR_IE_TARGET_ADDR(perr_elem);
+	target_sn = PERR_IE_TARGET_SN(perr_elem);
+	target_rcode = PERR_IE_TARGET_RCODE(perr_elem);
 
 	rcu_read_lock();
-	mpath = mesh_path_lookup(dst_addr, sdata);
+	mpath = mesh_path_lookup(target_addr, sdata);
 	if (mpath) {
 		spin_lock_bh(&mpath->state_lock);
 		if (mpath->flags & MESH_PATH_ACTIVE &&
 		    memcmp(ta, mpath->next_hop->sta.addr, ETH_ALEN) == 0 &&
-		    (!(mpath->flags & MESH_PATH_DSN_VALID) ||
-		    DSN_GT(dst_dsn, mpath->dsn))) {
+		    (!(mpath->flags & MESH_PATH_SN_VALID) ||
+		    SN_GT(target_sn, mpath->sn))) {
 			mpath->flags &= ~MESH_PATH_ACTIVE;
-			mpath->dsn = dst_dsn;
+			mpath->sn = target_sn;
 			spin_unlock_bh(&mpath->state_lock);
-			mesh_path_error_tx(ttl, dst_addr, cpu_to_le32(dst_dsn),
-					   cpu_to_le16(dst_rcode),
+			mesh_path_error_tx(ttl, target_addr, cpu_to_le32(target_sn),
+					   cpu_to_le16(target_rcode),
 					   sdata->dev->broadcast, sdata);
 		} else
 			spin_unlock_bh(&mpath->state_lock);
@@ -673,7 +676,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
 	u8 *ta;
 	u8 ttl, flags, hopcount;
 	u8 *orig_addr;
-	u32 orig_dsn, metric;
+	u32 orig_sn, metric;
 
 	ta = mgmt->sa;
 	ttl = rann->rann_ttl;
@@ -684,7 +687,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
 	ttl--;
 	flags = rann->rann_flags;
 	orig_addr = rann->rann_addr;
-	orig_dsn = rann->rann_seq;
+	orig_sn = rann->rann_seq;
 	hopcount = rann->rann_hopcount;
 	hopcount++;
 	metric = rann->rann_metric;
@@ -703,14 +706,14 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
 		mesh_queue_preq(mpath,
 				PREQ_Q_F_START | PREQ_Q_F_REFRESH);
 	}
-	if (mpath->dsn < orig_dsn) {
+	if (mpath->sn < orig_sn) {
 		mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr,
-				       cpu_to_le32(orig_dsn),
+				       cpu_to_le32(orig_sn),
 				       0, NULL, 0, sdata->dev->broadcast,
 				       hopcount, ttl, 0,
 				       cpu_to_le32(metric + mpath->metric),
 				       0, sdata);
-		mpath->dsn = orig_dsn;
+		mpath->sn = orig_sn;
 	}
 	rcu_read_unlock();
 }
@@ -823,7 +826,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 	struct mesh_preq_queue *preq_node;
 	struct mesh_path *mpath;
-	u8 ttl, dst_flags;
+	u8 ttl, target_flags;
 	u32 lifetime;
 
 	spin_lock_bh(&ifmsh->mesh_preq_queue_lock);
@@ -865,11 +868,11 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
 
 	ifmsh->last_preq = jiffies;
 
-	if (time_after(jiffies, ifmsh->last_dsn_update +
+	if (time_after(jiffies, ifmsh->last_sn_update +
 				net_traversal_jiffies(sdata)) ||
-	    time_before(jiffies, ifmsh->last_dsn_update)) {
-		++ifmsh->dsn;
-		sdata->u.mesh.last_dsn_update = jiffies;
+	    time_before(jiffies, ifmsh->last_sn_update)) {
+		++ifmsh->sn;
+		sdata->u.mesh.last_sn_update = jiffies;
 	}
 	lifetime = default_lifetime(sdata);
 	ttl = sdata->u.mesh.mshcfg.dot11MeshTTL;
@@ -880,14 +883,14 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
 	}
 
 	if (preq_node->flags & PREQ_Q_F_REFRESH)
-		dst_flags = MP_F_DO;
+		target_flags = MP_F_DO;
 	else
-		dst_flags = MP_F_RF;
+		target_flags = MP_F_RF;
 
 	spin_unlock_bh(&mpath->state_lock);
 	mesh_path_sel_frame_tx(MPATH_PREQ, 0, sdata->dev->dev_addr,
-			cpu_to_le32(ifmsh->dsn), dst_flags, mpath->dst,
-			cpu_to_le32(mpath->dsn), sdata->dev->broadcast, 0,
+			cpu_to_le32(ifmsh->sn), target_flags, mpath->dst,
+			cpu_to_le32(mpath->sn), sdata->dev->broadcast, 0,
 			ttl, cpu_to_le32(lifetime), 0,
 			cpu_to_le32(ifmsh->preq_id++), sdata);
 	mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout);
@@ -914,15 +917,15 @@ int mesh_nexthop_lookup(struct sk_buff *skb,
 	struct sk_buff *skb_to_free = NULL;
 	struct mesh_path *mpath;
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-	u8 *dst_addr = hdr->addr3;
+	u8 *target_addr = hdr->addr3;
 	int err = 0;
 
 	rcu_read_lock();
-	mpath = mesh_path_lookup(dst_addr, sdata);
+	mpath = mesh_path_lookup(target_addr, sdata);
 
 	if (!mpath) {
-		mesh_path_add(dst_addr, sdata);
-		mpath = mesh_path_lookup(dst_addr, sdata);
+		mesh_path_add(target_addr, sdata);
+		mpath = mesh_path_lookup(target_addr, sdata);
 		if (!mpath) {
 			sdata->u.mesh.mshstats.dropped_frames_no_route++;
 			err = -ENOSPC;
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 5d541235ca2f..2cebdf52b7bc 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -463,10 +463,10 @@ void mesh_plink_broken(struct sta_info *sta)
 		    mpath->flags & MESH_PATH_ACTIVE &&
 		    !(mpath->flags & MESH_PATH_FIXED)) {
 			mpath->flags &= ~MESH_PATH_ACTIVE;
-			++mpath->dsn;
+			++mpath->sn;
 			spin_unlock_bh(&mpath->state_lock);
 			mesh_path_error_tx(MESH_TTL, mpath->dst,
-					cpu_to_le32(mpath->dsn),
+					cpu_to_le32(mpath->sn),
 					PERR_RCODE_DEST_UNREACH,
 					sdata->dev->broadcast, sdata);
 		} else
@@ -602,7 +602,7 @@ void mesh_path_discard_frame(struct sk_buff *skb,
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct mesh_path *mpath;
-	u32 dsn = 0;
+	u32 sn = 0;
 
 	if (memcmp(hdr->addr4, sdata->dev->dev_addr, ETH_ALEN) != 0) {
 		u8 *ra, *da;
@@ -611,8 +611,8 @@ void mesh_path_discard_frame(struct sk_buff *skb,
 		ra = hdr->addr1;
 		mpath = mesh_path_lookup(da, sdata);
 		if (mpath)
-			dsn = ++mpath->dsn;
-		mesh_path_error_tx(MESH_TTL, skb->data, cpu_to_le32(dsn),
+			sn = ++mpath->sn;
+		mesh_path_error_tx(MESH_TTL, skb->data, cpu_to_le32(sn),
 				   PERR_RCODE_NO_ROUTE, ra, sdata);
 	}
 
@@ -648,7 +648,7 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop)
 {
 	spin_lock_bh(&mpath->state_lock);
 	mesh_path_assign_nexthop(mpath, next_hop);
-	mpath->dsn = 0xffff;
+	mpath->sn = 0xffff;
 	mpath->metric = 0;
 	mpath->hop_count = 0;
 	mpath->exp_time = 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 8c8e4eae6a17..7c1999872503 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2116,9 +2116,9 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq,
 	if (pinfo->filled & MPATH_INFO_FRAME_QLEN)
 		NLA_PUT_U32(msg, NL80211_MPATH_INFO_FRAME_QLEN,
 			    pinfo->frame_qlen);
-	if (pinfo->filled & MPATH_INFO_DSN)
-		NLA_PUT_U32(msg, NL80211_MPATH_INFO_DSN,
-			    pinfo->dsn);
+	if (pinfo->filled & MPATH_INFO_SN)
+		NLA_PUT_U32(msg, NL80211_MPATH_INFO_SN,
+			    pinfo->sn);
 	if (pinfo->filled & MPATH_INFO_METRIC)
 		NLA_PUT_U32(msg, NL80211_MPATH_INFO_METRIC,
 			    pinfo->metric);
-- 
cgit v1.2.3


From 63c5723bc3af8d4e86984dd4ff0c78218de418d0 Mon Sep 17 00:00:00 2001
From: Rui Paulo <rpaulo@gmail.com>
Date: Mon, 9 Nov 2009 23:46:57 +0000
Subject: mac80211: add nl80211/cfg80211 handling of the new mesh root mode
 option.

Signed-off-by: Rui Paulo <rpaulo@gmail.com>
Signed-off-by: Javier Cardona <javier@cozybit.com>
Reviewed-by: Andrey Yurovsky <andrey@cozybit.com>
Tested-by: Brian Cavagnolo <brian@cozybit.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h       |  3 +++
 include/net/cfg80211.h        |  1 +
 net/mac80211/cfg.c            |  7 +++++++
 net/mac80211/debugfs_netdev.c |  2 ++
 net/mac80211/mesh.c           | 13 +++++++++++++
 net/mac80211/mesh.h           |  1 +
 net/wireless/nl80211.c        |  6 ++++++
 7 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 7a0bd6ea6f63..d2f276de9abe 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -1200,6 +1200,8 @@ enum nl80211_mntr_flags {
  * @NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME: The interval of time (in TUs)
  * that it takes for an HWMP information element to propagate across the mesh
  *
+ * @NL80211_MESHCONF_ROOTMODE: whether root mode is enabled or not
+ *
  * @NL80211_MESHCONF_ATTR_MAX: highest possible mesh configuration attribute
  *
  * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use
@@ -1219,6 +1221,7 @@ enum nl80211_meshconf_params {
 	NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
 	NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
 	NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+	NL80211_MESHCONF_HWMP_ROOTMODE,
 
 	/* keep last */
 	__NL80211_MESHCONF_ATTR_AFTER_LAST,
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7d0ae9c18286..0e4c51fc63e5 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -508,6 +508,7 @@ struct mesh_config {
 	u32 dot11MeshHWMPactivePathTimeout;
 	u16 dot11MeshHWMPpreqMinInterval;
 	u16 dot11MeshHWMPnetDiameterTraversalTime;
+	u8  dot11MeshHWMPRootMode;
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 81053587e72b..7f18c8fa1880 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1029,7 +1029,10 @@ static int ieee80211_set_mesh_params(struct wiphy *wiphy,
 {
 	struct mesh_config *conf;
 	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_if_mesh *ifmsh;
+
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	ifmsh = &sdata->u.mesh;
 
 	/* Set the config options which we are interested in setting */
 	conf = &(sdata->u.mesh.mshcfg);
@@ -1064,6 +1067,10 @@ static int ieee80211_set_mesh_params(struct wiphy *wiphy,
 			   mask))
 		conf->dot11MeshHWMPnetDiameterTraversalTime =
 			nconf->dot11MeshHWMPnetDiameterTraversalTime;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOTMODE, mask)) {
+		conf->dot11MeshHWMPRootMode = nconf->dot11MeshHWMPRootMode;
+		ieee80211_mesh_root_setup(ifmsh);
+	}
 	return 0;
 }
 
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 8782264f49e7..472b2039906c 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -149,6 +149,8 @@ IEEE80211_IF_FILE(path_refresh_time,
 		u.mesh.mshcfg.path_refresh_time, DEC);
 IEEE80211_IF_FILE(min_discovery_timeout,
 		u.mesh.mshcfg.min_discovery_timeout, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPRootMode,
+		u.mesh.mshcfg.dot11MeshHWMPRootMode, DEC);
 #endif
 
 
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 88dcfe3030b1..05955dc6b3d3 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -373,6 +373,17 @@ static void ieee80211_mesh_path_root_timer(unsigned long data)
 	ieee80211_queue_work(&local->hw, &ifmsh->work);
 }
 
+void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh)
+{
+	if (ifmsh->mshcfg.dot11MeshHWMPRootMode)
+		set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags);
+	else {
+		clear_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags);
+		/* stop running timer */
+		del_timer_sync(&ifmsh->mesh_path_root_timer);
+	}
+}
+
 /**
  * ieee80211_fill_mesh_addresses - fill addresses of a locally originated mesh frame
  * @hdr:    	802.11 frame header
@@ -503,6 +514,7 @@ void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata)
 		add_timer(&ifmsh->mesh_path_timer);
 	if (test_and_clear_bit(TMR_RUNNING_MPR, &ifmsh->timers_running))
 		add_timer(&ifmsh->mesh_path_root_timer);
+	ieee80211_mesh_root_setup(ifmsh);
 }
 #endif
 
@@ -512,6 +524,7 @@ void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_local *local = sdata->local;
 
 	set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
+	ieee80211_mesh_root_setup(ifmsh);
 	ieee80211_queue_work(&local->hw, &ifmsh->work);
 	sdata->vif.bss_conf.beacon_int = MESH_DEFAULT_BEACON_INTERVAL;
 	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON |
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 00ee84258196..5ad6975bf28c 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -242,6 +242,7 @@ ieee80211_rx_result
 ieee80211_mesh_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb);
 void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
 void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh);
 
 /* Mesh paths */
 int mesh_nexthop_lookup(struct sk_buff *skb,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7c1999872503..17bcad69428d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2616,6 +2616,8 @@ static int nl80211_get_mesh_params(struct sk_buff *skb,
 			cur_params.dot11MeshHWMPpreqMinInterval);
 	NLA_PUT_U16(msg, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
 			cur_params.dot11MeshHWMPnetDiameterTraversalTime);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_HWMP_ROOTMODE,
+			cur_params.dot11MeshHWMPRootMode);
 	nla_nest_end(msg, pinfoattr);
 	genlmsg_end(msg, hdr);
 	err = genlmsg_reply(msg, info);
@@ -2726,6 +2728,10 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
 			dot11MeshHWMPnetDiameterTraversalTime,
 			mask, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
 			nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
+			dot11MeshHWMPRootMode, mask,
+			NL80211_MESHCONF_HWMP_ROOTMODE,
+			nla_get_u8);
 
 	/* Apply changes */
 	err = rdev->ops->set_mesh_params(&rdev->wiphy, dev, &cfg, mask);
-- 
cgit v1.2.3


From 61fa713c751683da915fa0c1aa502be85822c357 Mon Sep 17 00:00:00 2001
From: Holger Schurig <holgerschurig@gmail.com>
Date: Wed, 11 Nov 2009 12:25:40 +0100
Subject: cfg80211: return channel noise via survey API

This patch implements the NL80211_CMD_GET_SURVEY command and an get_survey()
ops that a driver can implement. The goal of this command is to allow a
drivers to report channel survey data (e.g. channel noise, channel
occupation).

For now, only the mechanism to report back channel noise has been
implemented.

In future, there will either be a survey-trigger command --- or the existing
scan-trigger command will be enhanced. This will allow user-space to
request survey for arbitrary channels.

Note: any driver that cannot report channel noise should not report
any value at all, e.g. made-up -92 dBm.

Signed-off-by: Holger Schurig <holgerschurig@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h |  34 ++++++++++++++++
 include/net/cfg80211.h  |  34 ++++++++++++++++
 net/wireless/nl80211.c  | 105 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 173 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index d2f276de9abe..45db17f81aa3 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -160,6 +160,11 @@
  * @NL80211_CMD_SCAN_ABORTED: scan was aborted, for unspecified reasons,
  *	partial scan results may be available
  *
+ * @NL80211_CMD_GET_SURVEY: get survey resuls, e.g. channel occupation
+ *      or noise level
+ * @NL80211_CMD_NEW_SURVEY_RESULTS: survey data notification (as a reply to
+ *	NL80211_CMD_GET_SURVEY and on the "scan" multicast group)
+ *
  * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain
  * 	has been changed and provides details of the request information
  * 	that caused the change such as who initiated the regulatory request
@@ -341,6 +346,9 @@ enum nl80211_commands {
 
 	NL80211_CMD_SET_WIPHY_NETNS,
 
+	NL80211_CMD_GET_SURVEY,
+	NL80211_CMD_NEW_SURVEY_RESULTS,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -586,6 +594,10 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_4ADDR: Use 4-address frames on a virtual interface
  *
+ * @NL80211_ATTR_SURVEY_INFO: survey information about a channel, part of
+ *      the survey response for %NL80211_CMD_GET_SURVEY, nested attribute
+ *      containing info as possible, see &enum survey_info.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -718,6 +730,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_4ADDR,
 
+	NL80211_ATTR_SURVEY_INFO,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -1120,6 +1134,26 @@ enum nl80211_reg_rule_flags {
 	NL80211_RRF_NO_IBSS		= 1<<8,
 };
 
+/**
+ * enum nl80211_survey_info - survey information
+ *
+ * These attribute types are used with %NL80211_ATTR_SURVEY_INFO
+ * when getting information about a survey.
+ *
+ * @__NL80211_SURVEY_INFO_INVALID: attribute number 0 is reserved
+ * @NL80211_SURVEY_INFO_FREQUENCY: center frequency of channel
+ * @NL80211_SURVEY_INFO_NOISE: noise level of channel (u8, dBm)
+ */
+enum nl80211_survey_info {
+	__NL80211_SURVEY_INFO_INVALID,
+	NL80211_SURVEY_INFO_FREQUENCY,
+	NL80211_SURVEY_INFO_NOISE,
+
+	/* keep last */
+	__NL80211_SURVEY_INFO_AFTER_LAST,
+	NL80211_SURVEY_INFO_MAX = __NL80211_SURVEY_INFO_AFTER_LAST - 1
+};
+
 /**
  * enum nl80211_mntr_flags - monitor configuration flags
  *
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0e4c51fc63e5..21710fc17eaf 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -234,6 +234,35 @@ struct key_params {
 	u32 cipher;
 };
 
+/**
+ * enum survey_info_flags - survey information flags
+ *
+ * Used by the driver to indicate which info in &struct survey_info
+ * it has filled in during the get_survey().
+ */
+enum survey_info_flags {
+	SURVEY_INFO_NOISE_DBM = 1<<0,
+};
+
+/**
+ * struct survey_info - channel survey response
+ *
+ * Used by dump_survey() to report back per-channel survey information.
+ *
+ * @channel: the channel this survey record reports, mandatory
+ * @filled: bitflag of flags from &enum survey_info_flags
+ * @noise: channel noise in dBm. This and all following fields are
+ *     optional
+ *
+ * This structure can later be expanded with things like
+ * channel duty cycle etc.
+ */
+struct survey_info {
+	struct ieee80211_channel *channel;
+	u32 filled;
+	s8 noise;
+};
+
 /**
  * struct beacon_parameters - beacon parameters
  *
@@ -944,6 +973,8 @@ struct cfg80211_bitrate_mask {
  * @rfkill_poll: polls the hw rfkill line, use cfg80211 reporting
  *	functions to adjust rfkill hw state
  *
+ * @dump_survey: get site survey information.
+ *
  * @testmode_cmd: run a test mode command
  */
 struct cfg80211_ops {
@@ -1063,6 +1094,9 @@ struct cfg80211_ops {
 				    const u8 *peer,
 				    const struct cfg80211_bitrate_mask *mask);
 
+	int	(*dump_survey)(struct wiphy *wiphy, struct net_device *netdev,
+			int idx, struct survey_info *info);
+
 	/* some temporary stuff to finish wext */
 	int	(*set_power_mgmt)(struct wiphy *wiphy, struct net_device *dev,
 				  bool enabled, int timeout);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 4dc139cdba5c..5c8b3bfada4b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3245,6 +3245,106 @@ static int nl80211_dump_scan(struct sk_buff *skb,
 	return err;
 }
 
+static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq,
+				int flags, struct net_device *dev,
+				struct survey_info *survey)
+{
+	void *hdr;
+	struct nlattr *infoattr;
+
+	/* Survey without a channel doesn't make sense */
+	if (!survey->channel)
+		return -EINVAL;
+
+	hdr = nl80211hdr_put(msg, pid, seq, flags,
+			     NL80211_CMD_NEW_SURVEY_RESULTS);
+	if (!hdr)
+		return -ENOMEM;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
+
+	infoattr = nla_nest_start(msg, NL80211_ATTR_SURVEY_INFO);
+	if (!infoattr)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(msg, NL80211_SURVEY_INFO_FREQUENCY,
+		    survey->channel->center_freq);
+	if (survey->filled & SURVEY_INFO_NOISE_DBM)
+		NLA_PUT_U8(msg, NL80211_SURVEY_INFO_NOISE,
+			    survey->noise);
+
+	nla_nest_end(msg, infoattr);
+
+	return genlmsg_end(msg, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int nl80211_dump_survey(struct sk_buff *skb,
+			struct netlink_callback *cb)
+{
+	struct survey_info survey;
+	struct cfg80211_registered_device *dev;
+	struct net_device *netdev;
+	int ifidx = cb->args[0];
+	int survey_idx = cb->args[1];
+	int res;
+
+	if (!ifidx)
+		ifidx = nl80211_get_ifidx(cb);
+	if (ifidx < 0)
+		return ifidx;
+	cb->args[0] = ifidx;
+
+	rtnl_lock();
+
+	netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
+	if (!netdev) {
+		res = -ENODEV;
+		goto out_rtnl;
+	}
+
+	dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
+	if (IS_ERR(dev)) {
+		res = PTR_ERR(dev);
+		goto out_rtnl;
+	}
+
+	if (!dev->ops->dump_survey) {
+		res = -EOPNOTSUPP;
+		goto out_err;
+	}
+
+	while (1) {
+		res = dev->ops->dump_survey(&dev->wiphy, netdev, survey_idx,
+					    &survey);
+		if (res == -ENOENT)
+			break;
+		if (res)
+			goto out_err;
+
+		if (nl80211_send_survey(skb,
+				NETLINK_CB(cb->skb).pid,
+				cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				netdev,
+				&survey) < 0)
+			goto out;
+		survey_idx++;
+	}
+
+ out:
+	cb->args[1] = survey_idx;
+	res = skb->len;
+ out_err:
+	cfg80211_unlock_rdev(dev);
+ out_rtnl:
+	rtnl_unlock();
+
+	return res;
+}
+
 static bool nl80211_valid_auth_type(enum nl80211_auth_type auth_type)
 {
 	return auth_type <= NL80211_AUTHTYPE_MAX;
@@ -4322,6 +4422,11 @@ static struct genl_ops nl80211_ops[] = {
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = NL80211_CMD_GET_SURVEY,
+		.policy = nl80211_policy,
+		.dumpit = nl80211_dump_survey,
+	},
 };
 static struct genl_multicast_group nl80211_mlme_mcgrp = {
 	.name = "mlme",
-- 
cgit v1.2.3


From 687b16fb617bd446439425a368ad7c7bbd202c73 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 13 Nov 2009 13:16:15 +0100
Subject: hw-breakpoints: Provide an off-case for counter_arch_bp()

If an arch doesn't support the hw breakpoints, counter_arch_bp()
has no off case to cover the missing breakpoint info structure
from the perf event. The result is a build error in non-x86
configs.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1258114575-32655-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Prasad <prasad@linux.vnet.ibm.com>
---
 include/linux/hw_breakpoint.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 7eba9b92e5f3..18710e0c84bd 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -16,11 +16,6 @@ enum {
 	HW_BREAKPOINT_X = 4,
 };
 
-static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
-{
-	return &bp->hw.info;
-}
-
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
@@ -83,6 +78,11 @@ extern void release_bp_slot(struct perf_event *bp);
 
 extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk);
 
+static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
+{
+	return &bp->hw.info;
+}
+
 #else /* !CONFIG_HAVE_HW_BREAKPOINT */
 
 static inline struct perf_event *
@@ -126,6 +126,11 @@ static inline void release_bp_slot(struct perf_event *bp) 		{ }
 
 static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk)	{ }
 
+static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 
 #endif /* _LINUX_HW_BREAKPOINT_H */
-- 
cgit v1.2.3


From f6c06b6807ff9281295989ebad72523865325a4f Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Fri, 13 Nov 2009 15:14:11 -0500
Subject: vc: Add support for hiding the cursor when creating VTs

Add support for setting a global default for whether or not a visible
cursor should be enabled when creating VCs. The default will be to do so,
unless overridden by the user at boot time or by a driver.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
LKML-Reference: <1258143251-5818-1-git-send-email-mjg@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/kernel-parameters.txt |  9 +++++++++
 drivers/char/vt.c                   | 11 ++++++++++-
 drivers/video/console/vgacon.c      |  2 ++
 include/linux/vt_kern.h             |  3 +++
 4 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9107b387e91f..dd42eb65fd27 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2729,6 +2729,15 @@ and is between 256 and 4096 characters. It is defined in the file
 			Default is 1, i.e. UTF-8 mode is enabled for all
 			newly opened terminals.
 
+	vt.global_cursor_default=
+			[VT]
+			Format=<-1|0|1>
+			Set system-wide default for whether a cursor
+			is shown on new VTs. Default is -1,
+			i.e. cursors will be created by default unless
+			overridden by individual drivers. 0 will hide
+			cursors, 1 will display them.
+
 	waveartist=	[HW,OSS]
 			Format: <io>,<irq>,<dma>,<dma2>
 
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 0c80c68cd047..1e3d728dbf7e 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -161,6 +161,8 @@ static void set_palette(struct vc_data *vc);
 static int printable;		/* Is console ready for printing? */
 int default_utf8 = true;
 module_param(default_utf8, int, S_IRUGO | S_IWUSR);
+int global_cursor_default = -1;
+module_param(global_cursor_default, int, S_IRUGO | S_IWUSR);
 
 /*
  * ignore_poke: don't unblank the screen when things are typed.  This is
@@ -775,6 +777,12 @@ int vc_allocate(unsigned int currcons)	/* return 0 on success */
 		vc_cons[currcons].d = NULL;
 		return -ENOMEM;
 	    }
+
+	    /* If no drivers have overridden us and the user didn't pass a
+	       boot option, default to displaying the cursor */
+	    if (global_cursor_default == -1)
+		    global_cursor_default = 1;
+
 	    vc_init(vc, vc->vc_rows, vc->vc_cols, 1);
 	    vcs_make_sysfs(currcons);
 	    atomic_notifier_call_chain(&vt_notifier_list, VT_ALLOCATE, &param);
@@ -1616,7 +1624,7 @@ static void reset_terminal(struct vc_data *vc, int do_clear)
 	vc->vc_decscnm		= 0;
 	vc->vc_decom		= 0;
 	vc->vc_decawm		= 1;
-	vc->vc_deccm		= 1;
+	vc->vc_deccm		= global_cursor_default;
 	vc->vc_decim		= 0;
 
 	set_kbd(vc, decarm);
@@ -4078,6 +4086,7 @@ EXPORT_SYMBOL(fg_console);
 EXPORT_SYMBOL(console_blank_hook);
 EXPORT_SYMBOL(console_blanked);
 EXPORT_SYMBOL(vc_cons);
+EXPORT_SYMBOL(global_cursor_default);
 #ifndef VT_SINGLE_DRIVER
 EXPORT_SYMBOL(take_over_console);
 EXPORT_SYMBOL(give_up_console);
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index da55ccaf4d55..564643edb3c1 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -585,6 +585,8 @@ static void vgacon_init(struct vc_data *c, int init)
 	vgacon_uni_pagedir[1]++;
 	if (!vgacon_uni_pagedir[0] && p)
 		con_set_default_unimap(c);
+
+	hide_boot_cursor(screen_info.flags & VIDEO_FLAGS_NOCURSOR);
 }
 
 static void vgacon_deinit(struct vc_data *c)
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index c0c4e1103a73..7f56db4a79f0 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -110,6 +110,7 @@ extern char con_buf[CON_BUF_SIZE];
 extern struct mutex con_buf_mtx;
 extern char vt_dont_switch;
 extern int default_utf8;
+extern int global_cursor_default;
 
 struct vt_spawn_console {
 	spinlock_t lock;
@@ -130,4 +131,6 @@ struct vt_notifier_param {
 extern int register_vt_notifier(struct notifier_block *nb);
 extern int unregister_vt_notifier(struct notifier_block *nb);
 
+extern void hide_boot_cursor(bool hide);
+
 #endif /* _VT_KERN_H */
-- 
cgit v1.2.3


From 688bcaff291cf2fe2734e43f2793d4d05b850518 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 14 Nov 2009 01:12:47 +0100
Subject: hw-breakpoints: Fix build on !perf architectures

the arch/alpha build fails with:

 In file included from tip/kernel/exit.c:52:
 tip/include/linux/hw_breakpoint.h: In function 'hw_breakpoint_addr':
 tip/include/linux/hw_breakpoint.h:21: error: 'struct perf_event' has no member named 'attr'
 [...]

Move these helper inlines inside the CONFIG_HAVE_HW_BREAKPOINT ifdef.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1258114575-32655-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hw_breakpoint.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 18710e0c84bd..0b98cbf76da7 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -16,6 +16,8 @@ enum {
 	HW_BREAKPOINT_X = 4,
 };
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
@@ -31,7 +33,6 @@ static inline int hw_breakpoint_len(struct perf_event *bp)
 	return bp->attr.bp_len;
 }
 
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
 extern struct perf_event *
 register_user_hw_breakpoint(unsigned long addr,
 			    int len,
-- 
cgit v1.2.3


From bee7ca9ec03a26676ea2b1c28dc4039348eff3e1 Mon Sep 17 00:00:00 2001
From: William Allen Simpson <william.allen.simpson@gmail.com>
Date: Tue, 10 Nov 2009 09:51:18 +0000
Subject: net: TCP_MSS_DEFAULT, TCP_MSS_DESIRED

Define two symbols needed in both kernel and user space.

Remove old (somewhat incorrect) kernel variant that wasn't used in
most cases.  Default should apply to both RMSS and SMSS (RFC2581).

Replace numeric constants with defined symbols.

Stand-alone patch, originally developed for TCPCT.

Signed-off-by: William.Allen.Simpson@gmail.com
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 6 ++++++
 include/net/tcp.h        | 3 ---
 net/ipv4/tcp_input.c     | 4 ++--
 net/ipv4/tcp_ipv4.c      | 6 +++---
 net/ipv4/tcp_minisocks.c | 2 +-
 net/ipv6/tcp_ipv6.c      | 2 +-
 6 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index eeecb8547a2a..32d7d77b4a01 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -81,6 +81,12 @@ enum {
 	TCP_DATA_OFFSET = __cpu_to_be32(0xF0000000)
 }; 
 
+/*
+ * TCP general constants
+ */
+#define TCP_MSS_DEFAULT		 536U	/* IPv4 (RFC1122, RFC2581) */
+#define TCP_MSS_DESIRED		1220U	/* IPv6 (tunneled), EDNS0 (RFC3226) */
+
 /* TCP socket options */
 #define TCP_NODELAY		1	/* Turn off Nagle's algorithm. */
 #define TCP_MAXSEG		2	/* Limit MSS */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index bf20f88fd033..325bfcf5c934 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -62,9 +62,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* Minimal accepted MSS. It is (60+60+8) - (20+20). */
 #define TCP_MIN_MSS		88U
 
-/* Minimal RCV_MSS. */
-#define TCP_MIN_RCVMSS		536U
-
 /* The least MTU to use for probing */
 #define TCP_BASE_MSS		512
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index be0c5bf7bfca..cc306ac6eb51 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -140,7 +140,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 		 * "len" is invariant segment length, including TCP header.
 		 */
 		len += skb->data - skb_transport_header(skb);
-		if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
+		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
 		    /* If PSH is not set, packet should be
 		     * full sized, provided peer TCP is not badly broken.
 		     * This observation (if it is correct 8)) allows
@@ -411,7 +411,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
 	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
 
 	hint = min(hint, tp->rcv_wnd / 2);
-	hint = min(hint, TCP_MIN_RCVMSS);
+	hint = min(hint, TCP_MSS_DEFAULT);
 	hint = max(hint, TCP_MIN_MSS);
 
 	inet_csk(sk)->icsk_ack.rcv_mss = hint;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 657ae334f125..cf7f2086e6e0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -217,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (inet->opt)
 		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 
-	tp->rx_opt.mss_clamp = 536;
+	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 
 	/* Socket identity is still unknown (sport may be zero).
 	 * However we set state to SYN-SENT and not releasing socket
@@ -1268,7 +1268,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		goto drop_and_free;
 
 	tcp_clear_options(&tmp_opt);
-	tmp_opt.mss_clamp = 536;
+	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_parse_options(skb, &tmp_opt, 0, dst);
@@ -1815,7 +1815,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	 */
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_clamp = ~0;
-	tp->mss_cache = 536;
+	tp->mss_cache = TCP_MSS_DEFAULT;
 
 	tp->reordering = sysctl_tcp_reordering;
 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a9d34e224cb6..4be22280e6b3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -476,7 +476,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		if (newtp->af_specific->md5_lookup(sk, newsk))
 			newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
-		if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
+		if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
 			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 696a22f034e8..de709091b26d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1851,7 +1851,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 	 */
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_clamp = ~0;
-	tp->mss_cache = 536;
+	tp->mss_cache = TCP_MSS_DEFAULT;
 
 	tp->reordering = sysctl_tcp_reordering;
 
-- 
cgit v1.2.3


From ce81b76a39835a721cd168e0c0bcfe7132f1f66b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 11 Nov 2009 17:34:30 +0000
Subject: ipv6: use RCU to walk list of network devices

No longer need read_lock(&dev_base_lock), use RCU instead.
We also can avoid taking references on inet6_dev structs.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 10 ++++++++++
 net/ipv6/anycast.c        | 29 +++++++++++++--------------
 net/ipv6/mcast.c          | 51 +++++++++++++++++++++--------------------------
 3 files changed, 47 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8b266390b9e2..61425d0c6123 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1114,6 +1114,16 @@ static inline struct net_device *next_net_device(struct net_device *dev)
 	return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
 }
 
+static inline struct net_device *next_net_device_rcu(struct net_device *dev)
+{
+	struct list_head *lh;
+	struct net *net;
+
+	net = dev_net(dev);
+	lh = rcu_dereference(dev->dev_list.next);
+	return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
+}
+
 static inline struct net_device *first_net_device(struct net *net)
 {
 	return list_empty(&net->dev_base_head) ? NULL :
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 2f00ca83f049..f1c74c8ef9de 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -431,9 +431,9 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
 	struct net *net = seq_file_net(seq);
 
 	state->idev = NULL;
-	for_each_netdev(net, state->dev) {
+	for_each_netdev_rcu(net, state->dev) {
 		struct inet6_dev *idev;
-		idev = in6_dev_get(state->dev);
+		idev = __in6_dev_get(state->dev);
 		if (!idev)
 			continue;
 		read_lock_bh(&idev->lock);
@@ -443,7 +443,6 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
 			break;
 		}
 		read_unlock_bh(&idev->lock);
-		in6_dev_put(idev);
 	}
 	return im;
 }
@@ -454,16 +453,15 @@ static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im
 
 	im = im->aca_next;
 	while (!im) {
-		if (likely(state->idev != NULL)) {
+		if (likely(state->idev != NULL))
 			read_unlock_bh(&state->idev->lock);
-			in6_dev_put(state->idev);
-		}
-		state->dev = next_net_device(state->dev);
+
+		state->dev = next_net_device_rcu(state->dev);
 		if (!state->dev) {
 			state->idev = NULL;
 			break;
 		}
-		state->idev = in6_dev_get(state->dev);
+		state->idev = __in6_dev_get(state->dev);
 		if (!state->idev)
 			continue;
 		read_lock_bh(&state->idev->lock);
@@ -482,29 +480,30 @@ static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *ac6_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(dev_base_lock)
+	__acquires(RCU)
 {
-	read_lock(&dev_base_lock);
+	rcu_read_lock();
 	return ac6_get_idx(seq, *pos);
 }
 
 static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct ifacaddr6 *im;
-	im = ac6_get_next(seq, v);
+	struct ifacaddr6 *im = ac6_get_next(seq, v);
+
 	++*pos;
 	return im;
 }
 
 static void ac6_seq_stop(struct seq_file *seq, void *v)
-	__releases(dev_base_lock)
+	__releases(RCU)
 {
 	struct ac6_iter_state *state = ac6_seq_private(seq);
+
 	if (likely(state->idev != NULL)) {
 		read_unlock_bh(&state->idev->lock);
-		in6_dev_put(state->idev);
+		state->idev = NULL;
 	}
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static int ac6_seq_show(struct seq_file *seq, void *v)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f9fcf690bd5d..1f9c44442e65 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2375,9 +2375,9 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
 	struct net *net = seq_file_net(seq);
 
 	state->idev = NULL;
-	for_each_netdev(net, state->dev) {
+	for_each_netdev_rcu(net, state->dev) {
 		struct inet6_dev *idev;
-		idev = in6_dev_get(state->dev);
+		idev = __in6_dev_get(state->dev);
 		if (!idev)
 			continue;
 		read_lock_bh(&idev->lock);
@@ -2387,7 +2387,6 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
 			break;
 		}
 		read_unlock_bh(&idev->lock);
-		in6_dev_put(idev);
 	}
 	return im;
 }
@@ -2398,16 +2397,15 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr
 
 	im = im->next;
 	while (!im) {
-		if (likely(state->idev != NULL)) {
+		if (likely(state->idev != NULL))
 			read_unlock_bh(&state->idev->lock);
-			in6_dev_put(state->idev);
-		}
-		state->dev = next_net_device(state->dev);
+
+		state->dev = next_net_device_rcu(state->dev);
 		if (!state->dev) {
 			state->idev = NULL;
 			break;
 		}
-		state->idev = in6_dev_get(state->dev);
+		state->idev = __in6_dev_get(state->dev);
 		if (!state->idev)
 			continue;
 		read_lock_bh(&state->idev->lock);
@@ -2426,31 +2424,31 @@ static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(dev_base_lock)
+	__acquires(RCU)
 {
-	read_lock(&dev_base_lock);
+	rcu_read_lock();
 	return igmp6_mc_get_idx(seq, *pos);
 }
 
 static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct ifmcaddr6 *im;
-	im = igmp6_mc_get_next(seq, v);
+	struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v);
+
 	++*pos;
 	return im;
 }
 
 static void igmp6_mc_seq_stop(struct seq_file *seq, void *v)
-	__releases(dev_base_lock)
+	__releases(RCU)
 {
 	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+
 	if (likely(state->idev != NULL)) {
 		read_unlock_bh(&state->idev->lock);
-		in6_dev_put(state->idev);
 		state->idev = NULL;
 	}
 	state->dev = NULL;
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
@@ -2507,9 +2505,9 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
 
 	state->idev = NULL;
 	state->im = NULL;
-	for_each_netdev(net, state->dev) {
+	for_each_netdev_rcu(net, state->dev) {
 		struct inet6_dev *idev;
-		idev = in6_dev_get(state->dev);
+		idev = __in6_dev_get(state->dev);
 		if (unlikely(idev == NULL))
 			continue;
 		read_lock_bh(&idev->lock);
@@ -2525,7 +2523,6 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
 			spin_unlock_bh(&im->mca_lock);
 		}
 		read_unlock_bh(&idev->lock);
-		in6_dev_put(idev);
 	}
 	return psf;
 }
@@ -2539,16 +2536,15 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s
 		spin_unlock_bh(&state->im->mca_lock);
 		state->im = state->im->next;
 		while (!state->im) {
-			if (likely(state->idev != NULL)) {
+			if (likely(state->idev != NULL))
 				read_unlock_bh(&state->idev->lock);
-				in6_dev_put(state->idev);
-			}
-			state->dev = next_net_device(state->dev);
+
+			state->dev = next_net_device_rcu(state->dev);
 			if (!state->dev) {
 				state->idev = NULL;
 				goto out;
 			}
-			state->idev = in6_dev_get(state->dev);
+			state->idev = __in6_dev_get(state->dev);
 			if (!state->idev)
 				continue;
 			read_lock_bh(&state->idev->lock);
@@ -2573,9 +2569,9 @@ static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(dev_base_lock)
+	__acquires(RCU)
 {
-	read_lock(&dev_base_lock);
+	rcu_read_lock();
 	return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 
@@ -2591,7 +2587,7 @@ static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v)
-	__releases(dev_base_lock)
+	__releases(RCU)
 {
 	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
 	if (likely(state->im != NULL)) {
@@ -2600,11 +2596,10 @@ static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v)
 	}
 	if (likely(state->idev != NULL)) {
 		read_unlock_bh(&state->idev->lock);
-		in6_dev_put(state->idev);
 		state->idev = NULL;
 	}
 	state->dev = NULL;
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 }
 
 static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
-- 
cgit v1.2.3


From 4c49b12853fbb5eff4849b7b6a1e895776f027a1 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Fri, 13 Nov 2009 21:47:33 -0800
Subject: perf_event: Fix invalid type in ioctl definition

u64 is invalid in userspace headers, including ioctl
definitions; use __u64 instead

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: <stable@kernel.org>
LKML-Reference: <20091113214733.7cd76be9@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 45b56faf5cdc..ec3768a81058 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -219,7 +219,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
 #define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
 #define PERF_EVENT_IOC_RESET		_IO ('$', 3)
-#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, u64)
+#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 
-- 
cgit v1.2.3


From 6959450e567c1f17d3ce8489099fc56c3721d577 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Sat, 14 Nov 2009 20:46:38 +0900
Subject: swiotlb: Remove duplicate swiotlb_force extern declarations

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: tony.luck@intel.com
LKML-Reference: <1258199198-16657-4-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/include/asm/swiotlb.h | 2 --
 arch/x86/include/asm/swiotlb.h  | 4 ----
 include/linux/swiotlb.h         | 2 ++
 3 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h
index dcbaea7ce128..f0acde68aaea 100644
--- a/arch/ia64/include/asm/swiotlb.h
+++ b/arch/ia64/include/asm/swiotlb.h
@@ -4,8 +4,6 @@
 #include <linux/dma-mapping.h>
 #include <linux/swiotlb.h>
 
-extern int swiotlb_force;
-
 #ifdef CONFIG_SWIOTLB
 extern int swiotlb;
 extern void pci_swiotlb_init(void);
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index 940f13a213f8..87ffcb12a1b8 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -3,10 +3,6 @@
 
 #include <linux/swiotlb.h>
 
-/* SWIOTLB interface */
-
-extern int swiotlb_force;
-
 #ifdef CONFIG_SWIOTLB
 extern int swiotlb;
 extern int pci_swiotlb_init(void);
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index eb9bdb4d4854..febedcf67c7e 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -7,6 +7,8 @@ struct device;
 struct dma_attrs;
 struct scatterlist;
 
+extern int swiotlb_force;
+
 /*
  * Maximum allowable number of contiguous slabs to map,
  * must be a power of 2.  What is the appropriate value ?
-- 
cgit v1.2.3


From 9a1654ba0b50402a6bd03c7b0fe9b0200a5ea7b1 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sun, 15 Nov 2009 07:20:12 +0000
Subject: net: Optimize hard_start_xmit() return checking

Recent changes in the TX error propagation require additional checking
and masking of values returned from hard_start_xmit(), mainly to
separate cases where skb was consumed. This aim can be simplified by
changing the order of NETDEV_TX and NET_XMIT codes, because the latter
are treated similarly to negative (ERRNO) values.

After this change much simpler dev_xmit_complete() is also used in
sch_direct_xmit(), so it is moved to netdevice.h.

Additionally NET_RX definitions in netdevice.h are moved up from
between TX codes to avoid confusion while reading the TX comment.

Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 42 ++++++++++++++++++++++++++++++------------
 net/core/dev.c            | 17 -----------------
 net/sched/sch_generic.c   | 23 +++++------------------
 3 files changed, 35 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 61425d0c6123..7043f85e643d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -63,6 +63,10 @@ struct wireless_dev;
 #define HAVE_FREE_NETDEV		/* free_netdev() */
 #define HAVE_NETDEV_PRIV		/* netdev_priv() */
 
+/* Backlog congestion levels */
+#define NET_RX_SUCCESS		0	/* keep 'em coming, baby */
+#define NET_RX_DROP		1	/* packet dropped */
+
 /*
  * Transmit return codes: transmit return codes originate from three different
  * namespaces:
@@ -82,14 +86,10 @@ struct wireless_dev;
 
 /* qdisc ->enqueue() return codes. */
 #define NET_XMIT_SUCCESS	0x00
-#define NET_XMIT_DROP		0x10	/* skb dropped			*/
-#define NET_XMIT_CN		0x20	/* congestion notification	*/
-#define NET_XMIT_POLICED	0x30	/* skb is shot by police	*/
-#define NET_XMIT_MASK		0xf0	/* qdisc flags in net/sch_generic.h */
-
-/* Backlog congestion levels */
-#define NET_RX_SUCCESS		0	/* keep 'em coming, baby */
-#define NET_RX_DROP		1	/* packet dropped */
+#define NET_XMIT_DROP		0x01	/* skb dropped			*/
+#define NET_XMIT_CN		0x02	/* congestion notification	*/
+#define NET_XMIT_POLICED	0x03	/* skb is shot by police	*/
+#define NET_XMIT_MASK		0x0f	/* qdisc flags in net/sch_generic.h */
 
 /* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It
  * indicates that the device will soon be dropping packets, or already drops
@@ -98,16 +98,34 @@ struct wireless_dev;
 #define net_xmit_errno(e)	((e) != NET_XMIT_CN ? -ENOBUFS : 0)
 
 /* Driver transmit return codes */
-#define NETDEV_TX_MASK		0xf
+#define NETDEV_TX_MASK		0xf0
 
 enum netdev_tx {
 	__NETDEV_TX_MIN	 = INT_MIN,	/* make sure enum is signed */
-	NETDEV_TX_OK	 = 0,		/* driver took care of packet */
-	NETDEV_TX_BUSY	 = 1,		/* driver tx path was busy*/
-	NETDEV_TX_LOCKED = 2,		/* driver tx lock was already taken */
+	NETDEV_TX_OK	 = 0x00,	/* driver took care of packet */
+	NETDEV_TX_BUSY	 = 0x10,	/* driver tx path was busy*/
+	NETDEV_TX_LOCKED = 0x20,	/* driver tx lock was already taken */
 };
 typedef enum netdev_tx netdev_tx_t;
 
+/*
+ * Current order: NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant;
+ * hard_start_xmit() return < NET_XMIT_MASK means skb was consumed.
+ */
+static inline bool dev_xmit_complete(int rc)
+{
+	/*
+	 * Positive cases with an skb consumed by a driver:
+	 * - successful transmission (rc == NETDEV_TX_OK)
+	 * - error while transmitting (rc < 0)
+	 * - error while queueing to a different device (rc & NET_XMIT_MASK)
+	 */
+	if (likely(rc < NET_XMIT_MASK))
+		return true;
+
+	return false;
+}
+
 #endif
 
 #define MAX_ADDR_LEN	32		/* Largest hardware address length */
diff --git a/net/core/dev.c b/net/core/dev.c
index 32045df1da5c..4b24d79414e3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1924,23 +1924,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	return rc;
 }
 
-static inline bool dev_xmit_complete(int rc)
-{
-	/* successful transmission */
-	if (rc == NETDEV_TX_OK)
-		return true;
-
-	/* error while transmitting, driver consumed skb */
-	if (rc < 0)
-		return true;
-
-	/* error while queueing to a different device, driver consumed skb */
-	if (rc & NET_XMIT_MASK)
-		return true;
-
-	return false;
-}
-
 /**
  *	dev_queue_xmit - transmit a buffer
  *	@skb: buffer to transmit
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index b13821ad2fb6..5173c1e1b19c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -119,39 +119,26 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	spin_unlock(root_lock);
 
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
-	if (!netif_tx_queue_stopped(txq) &&
-	    !netif_tx_queue_frozen(txq)) {
+	if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
 		ret = dev_hard_start_xmit(skb, dev, txq);
 
-		/* an error implies that the skb was consumed */
-		if (ret < 0)
-			ret = NETDEV_TX_OK;
-		/* all NET_XMIT codes map to NETDEV_TX_OK */
-		ret &= ~NET_XMIT_MASK;
-	}
 	HARD_TX_UNLOCK(dev, txq);
 
 	spin_lock(root_lock);
 
-	switch (ret) {
-	case NETDEV_TX_OK:
-		/* Driver sent out skb successfully */
+	if (dev_xmit_complete(ret)) {
+		/* Driver sent out skb successfully or skb was consumed */
 		ret = qdisc_qlen(q);
-		break;
-
-	case NETDEV_TX_LOCKED:
+	} else if (ret == NETDEV_TX_LOCKED) {
 		/* Driver try lock failed */
 		ret = handle_dev_cpu_collision(skb, txq, q);
-		break;
-
-	default:
+	} else {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */
 		if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
 			printk(KERN_WARNING "BUG %s code %d qlen %d\n",
 			       dev->name, ret, q->q.qlen);
 
 		ret = dev_requeue_skb(skb, q);
-		break;
 	}
 
 	if (ret && (netif_tx_queue_stopped(txq) ||
-- 
cgit v1.2.3


From b9f5d52670c27e71f04c466aee77e3a2eeca8080 Mon Sep 17 00:00:00 2001
From: Marin Mitov <mitov@issp.bas.bg>
Date: Fri, 13 Nov 2009 07:58:41 +0000
Subject: remove deprecated and not used: print_mac()

The function print_mac in net/ethernet/eth.c is marked __deprecated
and not used. Remove it.

Signed-off-by: Marin Mitov <mitov@issp.bas.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_ether.h | 4 ----
 net/ethernet/eth.c       | 7 -------
 2 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index 580b6004d00e..005e1525ab86 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h
@@ -136,10 +136,6 @@ extern struct ctl_table ether_table[];
 
 extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);
 
-/*
- *	Display a 6 byte device address (MAC) in a readable format.
- */
-extern char *print_mac(char *buf, const unsigned char *addr) __deprecated;
 #define MAC_FMT "%02x:%02x:%02x:%02x:%02x:%02x"
 #define MAC_BUF_SIZE	18
 #define DECLARE_MAC_BUF(var) char var[MAC_BUF_SIZE]
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 5a883affecd3..dd3db88f8f0a 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -393,10 +393,3 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 	return ((ssize_t) l);
 }
 EXPORT_SYMBOL(sysfs_format_mac);
-
-char *print_mac(char *buf, const unsigned char *addr)
-{
-	_format_mac_addr(buf, MAC_BUF_SIZE, addr, ETH_ALEN);
-	return buf;
-}
-EXPORT_SYMBOL(print_mac);
-- 
cgit v1.2.3


From 559fdc3c1b624edb1933a875022fe7e27934d11c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 16 Nov 2009 12:45:14 +0100
Subject: perf_event: Optimize perf_output_lock()

The purpose of perf_output_{un,}lock() is to:

 1) avoid publishing incomplete data
    [ possible when publishing a head that is ahead of an entry
      that is still being written ]

 2) guarantee fwd progress
    [ a simple refcount on pending writers doesn't need to drop to
      0, making it so would end up implementing something like forced
      quiecent states of RCU ]

To satisfy the above without undue complexity it serializes
between CPUs, this means that a pending writer can only be the
same cpu in a nested context, and since (under normal operation)
a cpu always makes progress we're good -- if the head is only
published when the bottom  most writer completes.

Now we don't need to disable IRQs in order to serialize between
CPUs, disabling preemption ought to be sufficient, esp since we
already deal with nesting due to NMIs.

This avoids potentially expensive (and needless) local IRQ
disable/enable ops.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1258373161.26714.254.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  1 -
 kernel/perf_event.c        | 21 +++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index df4e73e33774..7f87563c8485 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -714,7 +714,6 @@ struct perf_output_handle {
 	int				nmi;
 	int				sample;
 	int				locked;
-	unsigned long			flags;
 };
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6f4ed3b4cd73..3256e36ad251 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2674,20 +2674,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 static void perf_output_lock(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int cpu;
+	int cur, cpu = get_cpu();
 
 	handle->locked = 0;
 
-	local_irq_save(handle->flags);
-	cpu = smp_processor_id();
-
-	if (in_nmi() && atomic_read(&data->lock) == cpu)
-		return;
+	for (;;) {
+		cur = atomic_cmpxchg(&data->lock, -1, cpu);
+		if (cur == -1) {
+			handle->locked = 1;
+			break;
+		}
+		if (cur == cpu)
+			break;
 
-	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
 		cpu_relax();
-
-	handle->locked = 1;
+	}
 }
 
 static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2733,7 +2734,7 @@ again:
 	if (atomic_xchg(&data->wakeup, 0))
 		perf_output_wakeup(handle);
 out:
-	local_irq_restore(handle->flags);
+	put_cpu();
 }
 
 void perf_output_copy(struct perf_output_handle *handle,
-- 
cgit v1.2.3


From c85e9d7739fc8d879c4293ea020760926d6f87cd Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Tue, 17 Nov 2009 10:16:32 -0500
Subject: znet: fix build failure from i82593.h relocation

znet was including "wireless/i82593.h" (which is a bit wierd), and I
missed that when I relocated i82593.h to drivers/staging/wavelan.  Since
I don't have ISA turned-on in my normal .config, I didn't see the build
failures -- mea culpa!

Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/znet.c                     |   3 +-
 drivers/staging/wavelan/i82593.h       | 229 ---------------------------------
 drivers/staging/wavelan/wavelan_cs.p.h |   2 +-
 include/linux/i82593.h                 | 229 +++++++++++++++++++++++++++++++++
 4 files changed, 231 insertions(+), 232 deletions(-)
 delete mode 100644 drivers/staging/wavelan/i82593.h
 create mode 100644 include/linux/i82593.h

(limited to 'include/linux')

diff --git a/drivers/net/znet.c b/drivers/net/znet.c
index b42347333750..443c4eee28c1 100644
--- a/drivers/net/znet.c
+++ b/drivers/net/znet.c
@@ -103,8 +103,7 @@
 #include <asm/io.h>
 #include <asm/dma.h>
 
-/* This include could be elsewhere, since it is not wireless specific */
-#include "wireless/i82593.h"
+#include <linux/i82593.h>
 
 static char version[] __initdata = "znet.c:v1.02 9/23/94 becker@scyld.com\n";
 
diff --git a/drivers/staging/wavelan/i82593.h b/drivers/staging/wavelan/i82593.h
deleted file mode 100644
index afac5c7a323d..000000000000
--- a/drivers/staging/wavelan/i82593.h
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Definitions for Intel 82593 CSMA/CD Core LAN Controller
- * The definitions are taken from the 1992 users manual with Intel
- * order number 297125-001.
- *
- * /usr/src/pc/RCS/i82593.h,v 1.1 1996/07/17 15:23:12 root Exp
- *
- * Copyright 1994, Anders Klemets <klemets@it.kth.se>
- *
- * HISTORY
- * i82593.h,v
- * Revision 1.4  2005/11/4  09:15:00  baroniunas
- * Modified copyright with permission of author as follows:
- *
- *   "If I82539.H is the only file with my copyright statement
- *    that is included in the Source Forge project, then you have
- *    my approval to change the copyright statement to be a GPL
- *    license, in the way you proposed on October 10."
- *
- * Revision 1.1  1996/07/17 15:23:12  root
- * Initial revision
- *
- * Revision 1.3  1995/04/05  15:13:58  adj
- * Initial alpha release
- *
- * Revision 1.2  1994/06/16  23:57:31  klemets
- * Mirrored all the fields in the configuration block.
- *
- * Revision 1.1  1994/06/02  20:25:34  klemets
- * Initial revision
- *
- *
- */
-#ifndef	_I82593_H
-#define	_I82593_H
-
-/* Intel 82593 CSMA/CD Core LAN Controller */
-
-/* Port 0 Command Register definitions */
-
-/* Execution operations */
-#define OP0_NOP			0	/* CHNL = 0 */
-#define OP0_SWIT_TO_PORT_1	0	/* CHNL = 1 */
-#define OP0_IA_SETUP		1
-#define OP0_CONFIGURE		2
-#define OP0_MC_SETUP		3
-#define OP0_TRANSMIT		4
-#define OP0_TDR			5
-#define OP0_DUMP		6
-#define OP0_DIAGNOSE		7
-#define OP0_TRANSMIT_NO_CRC	9
-#define OP0_RETRANSMIT		12
-#define OP0_ABORT		13
-/* Reception operations */
-#define OP0_RCV_ENABLE		8
-#define OP0_RCV_DISABLE		10
-#define OP0_STOP_RCV		11
-/* Status pointer control operations */
-#define OP0_FIX_PTR		15	/* CHNL = 1 */
-#define OP0_RLS_PTR		15	/* CHNL = 0 */
-#define OP0_RESET		14
-
-#define CR0_CHNL		(1 << 4)	/* 0=Channel 0, 1=Channel 1 */
-#define CR0_STATUS_0		0x00
-#define CR0_STATUS_1		0x20
-#define CR0_STATUS_2		0x40
-#define CR0_STATUS_3		0x60
-#define CR0_INT_ACK		(1 << 7)	/* 0=No ack, 1=acknowledge */
-
-/* Port 0 Status Register definitions */
-
-#define SR0_NO_RESULT		0		/* dummy */
-#define SR0_EVENT_MASK		0x0f
-#define SR0_IA_SETUP_DONE	1
-#define SR0_CONFIGURE_DONE	2
-#define SR0_MC_SETUP_DONE	3
-#define SR0_TRANSMIT_DONE	4
-#define SR0_TDR_DONE		5
-#define SR0_DUMP_DONE		6
-#define SR0_DIAGNOSE_PASSED	7
-#define SR0_TRANSMIT_NO_CRC_DONE 9
-#define SR0_RETRANSMIT_DONE	12
-#define SR0_EXECUTION_ABORTED	13
-#define SR0_END_OF_FRAME	8
-#define SR0_RECEPTION_ABORTED	10
-#define SR0_DIAGNOSE_FAILED	15
-#define SR0_STOP_REG_HIT	11
-
-#define SR0_CHNL		(1 << 4)
-#define SR0_EXECUTION		(1 << 5)
-#define SR0_RECEPTION		(1 << 6)
-#define SR0_INTERRUPT		(1 << 7)
-#define SR0_BOTH_RX_TX		(SR0_EXECUTION | SR0_RECEPTION)
-
-#define SR3_EXEC_STATE_MASK	0x03
-#define SR3_EXEC_IDLE		0
-#define SR3_TX_ABORT_IN_PROGRESS 1
-#define SR3_EXEC_ACTIVE		2
-#define SR3_ABORT_IN_PROGRESS	3
-#define SR3_EXEC_CHNL		(1 << 2)
-#define SR3_STP_ON_NO_RSRC	(1 << 3)
-#define SR3_RCVING_NO_RSRC	(1 << 4)
-#define SR3_RCV_STATE_MASK	0x60
-#define SR3_RCV_IDLE		0x00
-#define SR3_RCV_READY		0x20
-#define SR3_RCV_ACTIVE		0x40
-#define SR3_RCV_STOP_IN_PROG	0x60
-#define SR3_RCV_CHNL		(1 << 7)
-
-/* Port 1 Command Register definitions */
-
-#define OP1_NOP			0
-#define OP1_SWIT_TO_PORT_0	1
-#define OP1_INT_DISABLE		2
-#define OP1_INT_ENABLE		3
-#define OP1_SET_TS		5
-#define OP1_RST_TS		7
-#define OP1_POWER_DOWN		8
-#define OP1_RESET_RING_MNGMT	11
-#define OP1_RESET		14
-#define OP1_SEL_RST		15
-
-#define CR1_STATUS_4		0x00
-#define CR1_STATUS_5		0x20
-#define CR1_STATUS_6		0x40
-#define CR1_STOP_REG_UPDATE	(1 << 7)
-
-/* Receive frame status bits */
-
-#define	RX_RCLD			(1 << 0)
-#define RX_IA_MATCH		(1 << 1)
-#define	RX_NO_AD_MATCH		(1 << 2)
-#define RX_NO_SFD		(1 << 3)
-#define RX_SRT_FRM		(1 << 7)
-#define RX_OVRRUN		(1 << 8)
-#define RX_ALG_ERR		(1 << 10)
-#define RX_CRC_ERR		(1 << 11)
-#define RX_LEN_ERR		(1 << 12)
-#define RX_RCV_OK		(1 << 13)
-#define RX_TYP_LEN		(1 << 15)
-
-/* Transmit status bits */
-
-#define TX_NCOL_MASK		0x0f
-#define TX_FRTL			(1 << 4)
-#define TX_MAX_COL		(1 << 5)
-#define TX_HRT_BEAT		(1 << 6)
-#define TX_DEFER		(1 << 7)
-#define TX_UND_RUN		(1 << 8)
-#define TX_LOST_CTS		(1 << 9)
-#define TX_LOST_CRS		(1 << 10)
-#define TX_LTCOL		(1 << 11)
-#define TX_OK			(1 << 13)
-#define TX_COLL			(1 << 15)
-
-struct i82593_conf_block {
-  u_char fifo_limit : 4,
-  	 forgnesi   : 1,
-  	 fifo_32    : 1,
-  	 d6mod      : 1,
-  	 throttle_enb : 1;
-  u_char throttle   : 6,
-	 cntrxint   : 1,
-	 contin	    : 1;
-  u_char addr_len   : 3,
-  	 acloc 	    : 1,
- 	 preamb_len : 2,
-  	 loopback   : 2;
-  u_char lin_prio   : 3,
-	 tbofstop   : 1,
-	 exp_prio   : 3,
-	 bof_met    : 1;
-  u_char	    : 4,
-	 ifrm_spc   : 4;
-  u_char	    : 5,
-	 slottim_low : 3;
-  u_char slottim_hi : 3,
-		    : 1,
-	 max_retr   : 4;
-  u_char prmisc     : 1,
-	 bc_dis     : 1,
-  		    : 1,
-	 crs_1	    : 1,
-	 nocrc_ins  : 1,
-	 crc_1632   : 1,
-  	 	    : 1,
-  	 crs_cdt    : 1;
-  u_char cs_filter  : 3,
-	 crs_src    : 1,
-	 cd_filter  : 3,
-		    : 1;
-  u_char	    : 2,
-  	 min_fr_len : 6;
-  u_char lng_typ    : 1,
-	 lng_fld    : 1,
-	 rxcrc_xf   : 1,
-	 artx	    : 1,
-	 sarec	    : 1,
-	 tx_jabber  : 1,	/* why is this called max_len in the manual? */
-	 hash_1	    : 1,
-  	 lbpkpol    : 1;
-  u_char	    : 6,
-  	 fdx	    : 1,
-  	  	    : 1;
-  u_char dummy_6    : 6,	/* supposed to be ones */
-  	 mult_ia    : 1,
-  	 dis_bof    : 1;
-  u_char dummy_1    : 1,	/* supposed to be one */
-	 tx_ifs_retrig : 2,
-	 mc_all     : 1,
-	 rcv_mon    : 2,
-	 frag_acpt  : 1,
-  	 tstrttrs   : 1;
-  u_char fretx	    : 1,
-	 runt_eop   : 1,
-	 hw_sw_pin  : 1,
-	 big_endn   : 1,
-	 syncrqs    : 1,
-	 sttlen     : 1,
-	 tx_eop     : 1,
-  	 rx_eop	    : 1;
-  u_char rbuf_size  : 5,
-	 rcvstop    : 1,
-  	 	    : 2;
-};
-
-#define I82593_MAX_MULTICAST_ADDRESSES	128	/* Hardware hashed filter */
-
-#endif /* _I82593_H */
diff --git a/drivers/staging/wavelan/wavelan_cs.p.h b/drivers/staging/wavelan/wavelan_cs.p.h
index 81d91531c4f9..8fbfaa8a5a67 100644
--- a/drivers/staging/wavelan/wavelan_cs.p.h
+++ b/drivers/staging/wavelan/wavelan_cs.p.h
@@ -446,7 +446,7 @@
 #include <pcmcia/ds.h>
 
 /* Wavelan declarations */
-#include "i82593.h"	/* Definitions for the Intel chip */
+#include <linux/i82593.h>	/* Definitions for the Intel chip */
 
 #include "wavelan_cs.h"	/* Others bits of the hardware */
 
diff --git a/include/linux/i82593.h b/include/linux/i82593.h
new file mode 100644
index 000000000000..afac5c7a323d
--- /dev/null
+++ b/include/linux/i82593.h
@@ -0,0 +1,229 @@
+/*
+ * Definitions for Intel 82593 CSMA/CD Core LAN Controller
+ * The definitions are taken from the 1992 users manual with Intel
+ * order number 297125-001.
+ *
+ * /usr/src/pc/RCS/i82593.h,v 1.1 1996/07/17 15:23:12 root Exp
+ *
+ * Copyright 1994, Anders Klemets <klemets@it.kth.se>
+ *
+ * HISTORY
+ * i82593.h,v
+ * Revision 1.4  2005/11/4  09:15:00  baroniunas
+ * Modified copyright with permission of author as follows:
+ *
+ *   "If I82539.H is the only file with my copyright statement
+ *    that is included in the Source Forge project, then you have
+ *    my approval to change the copyright statement to be a GPL
+ *    license, in the way you proposed on October 10."
+ *
+ * Revision 1.1  1996/07/17 15:23:12  root
+ * Initial revision
+ *
+ * Revision 1.3  1995/04/05  15:13:58  adj
+ * Initial alpha release
+ *
+ * Revision 1.2  1994/06/16  23:57:31  klemets
+ * Mirrored all the fields in the configuration block.
+ *
+ * Revision 1.1  1994/06/02  20:25:34  klemets
+ * Initial revision
+ *
+ *
+ */
+#ifndef	_I82593_H
+#define	_I82593_H
+
+/* Intel 82593 CSMA/CD Core LAN Controller */
+
+/* Port 0 Command Register definitions */
+
+/* Execution operations */
+#define OP0_NOP			0	/* CHNL = 0 */
+#define OP0_SWIT_TO_PORT_1	0	/* CHNL = 1 */
+#define OP0_IA_SETUP		1
+#define OP0_CONFIGURE		2
+#define OP0_MC_SETUP		3
+#define OP0_TRANSMIT		4
+#define OP0_TDR			5
+#define OP0_DUMP		6
+#define OP0_DIAGNOSE		7
+#define OP0_TRANSMIT_NO_CRC	9
+#define OP0_RETRANSMIT		12
+#define OP0_ABORT		13
+/* Reception operations */
+#define OP0_RCV_ENABLE		8
+#define OP0_RCV_DISABLE		10
+#define OP0_STOP_RCV		11
+/* Status pointer control operations */
+#define OP0_FIX_PTR		15	/* CHNL = 1 */
+#define OP0_RLS_PTR		15	/* CHNL = 0 */
+#define OP0_RESET		14
+
+#define CR0_CHNL		(1 << 4)	/* 0=Channel 0, 1=Channel 1 */
+#define CR0_STATUS_0		0x00
+#define CR0_STATUS_1		0x20
+#define CR0_STATUS_2		0x40
+#define CR0_STATUS_3		0x60
+#define CR0_INT_ACK		(1 << 7)	/* 0=No ack, 1=acknowledge */
+
+/* Port 0 Status Register definitions */
+
+#define SR0_NO_RESULT		0		/* dummy */
+#define SR0_EVENT_MASK		0x0f
+#define SR0_IA_SETUP_DONE	1
+#define SR0_CONFIGURE_DONE	2
+#define SR0_MC_SETUP_DONE	3
+#define SR0_TRANSMIT_DONE	4
+#define SR0_TDR_DONE		5
+#define SR0_DUMP_DONE		6
+#define SR0_DIAGNOSE_PASSED	7
+#define SR0_TRANSMIT_NO_CRC_DONE 9
+#define SR0_RETRANSMIT_DONE	12
+#define SR0_EXECUTION_ABORTED	13
+#define SR0_END_OF_FRAME	8
+#define SR0_RECEPTION_ABORTED	10
+#define SR0_DIAGNOSE_FAILED	15
+#define SR0_STOP_REG_HIT	11
+
+#define SR0_CHNL		(1 << 4)
+#define SR0_EXECUTION		(1 << 5)
+#define SR0_RECEPTION		(1 << 6)
+#define SR0_INTERRUPT		(1 << 7)
+#define SR0_BOTH_RX_TX		(SR0_EXECUTION | SR0_RECEPTION)
+
+#define SR3_EXEC_STATE_MASK	0x03
+#define SR3_EXEC_IDLE		0
+#define SR3_TX_ABORT_IN_PROGRESS 1
+#define SR3_EXEC_ACTIVE		2
+#define SR3_ABORT_IN_PROGRESS	3
+#define SR3_EXEC_CHNL		(1 << 2)
+#define SR3_STP_ON_NO_RSRC	(1 << 3)
+#define SR3_RCVING_NO_RSRC	(1 << 4)
+#define SR3_RCV_STATE_MASK	0x60
+#define SR3_RCV_IDLE		0x00
+#define SR3_RCV_READY		0x20
+#define SR3_RCV_ACTIVE		0x40
+#define SR3_RCV_STOP_IN_PROG	0x60
+#define SR3_RCV_CHNL		(1 << 7)
+
+/* Port 1 Command Register definitions */
+
+#define OP1_NOP			0
+#define OP1_SWIT_TO_PORT_0	1
+#define OP1_INT_DISABLE		2
+#define OP1_INT_ENABLE		3
+#define OP1_SET_TS		5
+#define OP1_RST_TS		7
+#define OP1_POWER_DOWN		8
+#define OP1_RESET_RING_MNGMT	11
+#define OP1_RESET		14
+#define OP1_SEL_RST		15
+
+#define CR1_STATUS_4		0x00
+#define CR1_STATUS_5		0x20
+#define CR1_STATUS_6		0x40
+#define CR1_STOP_REG_UPDATE	(1 << 7)
+
+/* Receive frame status bits */
+
+#define	RX_RCLD			(1 << 0)
+#define RX_IA_MATCH		(1 << 1)
+#define	RX_NO_AD_MATCH		(1 << 2)
+#define RX_NO_SFD		(1 << 3)
+#define RX_SRT_FRM		(1 << 7)
+#define RX_OVRRUN		(1 << 8)
+#define RX_ALG_ERR		(1 << 10)
+#define RX_CRC_ERR		(1 << 11)
+#define RX_LEN_ERR		(1 << 12)
+#define RX_RCV_OK		(1 << 13)
+#define RX_TYP_LEN		(1 << 15)
+
+/* Transmit status bits */
+
+#define TX_NCOL_MASK		0x0f
+#define TX_FRTL			(1 << 4)
+#define TX_MAX_COL		(1 << 5)
+#define TX_HRT_BEAT		(1 << 6)
+#define TX_DEFER		(1 << 7)
+#define TX_UND_RUN		(1 << 8)
+#define TX_LOST_CTS		(1 << 9)
+#define TX_LOST_CRS		(1 << 10)
+#define TX_LTCOL		(1 << 11)
+#define TX_OK			(1 << 13)
+#define TX_COLL			(1 << 15)
+
+struct i82593_conf_block {
+  u_char fifo_limit : 4,
+  	 forgnesi   : 1,
+  	 fifo_32    : 1,
+  	 d6mod      : 1,
+  	 throttle_enb : 1;
+  u_char throttle   : 6,
+	 cntrxint   : 1,
+	 contin	    : 1;
+  u_char addr_len   : 3,
+  	 acloc 	    : 1,
+ 	 preamb_len : 2,
+  	 loopback   : 2;
+  u_char lin_prio   : 3,
+	 tbofstop   : 1,
+	 exp_prio   : 3,
+	 bof_met    : 1;
+  u_char	    : 4,
+	 ifrm_spc   : 4;
+  u_char	    : 5,
+	 slottim_low : 3;
+  u_char slottim_hi : 3,
+		    : 1,
+	 max_retr   : 4;
+  u_char prmisc     : 1,
+	 bc_dis     : 1,
+  		    : 1,
+	 crs_1	    : 1,
+	 nocrc_ins  : 1,
+	 crc_1632   : 1,
+  	 	    : 1,
+  	 crs_cdt    : 1;
+  u_char cs_filter  : 3,
+	 crs_src    : 1,
+	 cd_filter  : 3,
+		    : 1;
+  u_char	    : 2,
+  	 min_fr_len : 6;
+  u_char lng_typ    : 1,
+	 lng_fld    : 1,
+	 rxcrc_xf   : 1,
+	 artx	    : 1,
+	 sarec	    : 1,
+	 tx_jabber  : 1,	/* why is this called max_len in the manual? */
+	 hash_1	    : 1,
+  	 lbpkpol    : 1;
+  u_char	    : 6,
+  	 fdx	    : 1,
+  	  	    : 1;
+  u_char dummy_6    : 6,	/* supposed to be ones */
+  	 mult_ia    : 1,
+  	 dis_bof    : 1;
+  u_char dummy_1    : 1,	/* supposed to be one */
+	 tx_ifs_retrig : 2,
+	 mc_all     : 1,
+	 rcv_mon    : 2,
+	 frag_acpt  : 1,
+  	 tstrttrs   : 1;
+  u_char fretx	    : 1,
+	 runt_eop   : 1,
+	 hw_sw_pin  : 1,
+	 big_endn   : 1,
+	 syncrqs    : 1,
+	 sttlen     : 1,
+	 tx_eop     : 1,
+  	 rx_eop	    : 1;
+  u_char rbuf_size  : 5,
+	 rcvstop    : 1,
+  	 	    : 2;
+};
+
+#define I82593_MAX_MULTICAST_ADDRESSES	128	/* Hardware hashed filter */
+
+#endif /* _I82593_H */
-- 
cgit v1.2.3


From d83345adf96bc13a5e360f4649a2e68ef968dec0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 16 Nov 2009 03:36:51 +0000
Subject: net: add dev_txq_stats_fold() helper

Some drivers ndo_get_stats() method need to perform txqueue stats folding.

Move folding from dev_get_stats() to a new dev_txq_stats_fold() function

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 48 ++++++++++++++++++++++++++++-------------------
 2 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7043f85e643d..c8fa4627de00 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1941,6 +1941,7 @@ extern void		netdev_features_change(struct net_device *dev);
 extern void		dev_load(struct net *net, const char *name);
 extern void		dev_mcast_init(void);
 extern const struct net_device_stats *dev_get_stats(struct net_device *dev);
+extern void		dev_txq_stats_fold(const struct net_device *dev, struct net_device_stats *stats);
 
 extern int		netdev_max_backlog;
 extern int		weight_p;
diff --git a/net/core/dev.c b/net/core/dev.c
index d867522290b9..c3e0578d29d1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5168,6 +5168,32 @@ void netdev_run_todo(void)
 	}
 }
 
+/**
+ *	dev_txq_stats_fold - fold tx_queues stats
+ *	@dev: device to get statistics from
+ *	@stats: struct net_device_stats to hold results
+ */
+void dev_txq_stats_fold(const struct net_device *dev,
+			struct net_device_stats *stats)
+{
+	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
+	unsigned int i;
+	struct netdev_queue *txq;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		txq = netdev_get_tx_queue(dev, i);
+		tx_bytes   += txq->tx_bytes;
+		tx_packets += txq->tx_packets;
+		tx_dropped += txq->tx_dropped;
+	}
+	if (tx_bytes || tx_packets || tx_dropped) {
+		stats->tx_bytes   = tx_bytes;
+		stats->tx_packets = tx_packets;
+		stats->tx_dropped = tx_dropped;
+	}
+}
+EXPORT_SYMBOL(dev_txq_stats_fold);
+
 /**
  *	dev_get_stats	- get network device statistics
  *	@dev: device to get statistics from
@@ -5182,25 +5208,9 @@ const struct net_device_stats *dev_get_stats(struct net_device *dev)
 
 	if (ops->ndo_get_stats)
 		return ops->ndo_get_stats(dev);
-	else {
-		unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
-		struct net_device_stats *stats = &dev->stats;
-		unsigned int i;
-		struct netdev_queue *txq;
-
-		for (i = 0; i < dev->num_tx_queues; i++) {
-			txq = netdev_get_tx_queue(dev, i);
-			tx_bytes   += txq->tx_bytes;
-			tx_packets += txq->tx_packets;
-			tx_dropped += txq->tx_dropped;
-		}
-		if (tx_bytes || tx_packets || tx_dropped) {
-			stats->tx_bytes   = tx_bytes;
-			stats->tx_packets = tx_packets;
-			stats->tx_dropped = tx_dropped;
-		}
-		return stats;
-	}
+
+	dev_txq_stats_fold(dev, &dev->stats);
+	return &dev->stats;
 }
 EXPORT_SYMBOL(dev_get_stats);
 
-- 
cgit v1.2.3


From 395264d509aec45149745843d9a737140a1ece16 Mon Sep 17 00:00:00 2001
From: Octavian Purdila <opurdila@ixiacom.com>
Date: Mon, 16 Nov 2009 13:49:35 +0000
Subject: net: introduce NETDEV_UNREGISTER_PERNET

This new event is called once for each unique net namespace in batched
unregister operations (with the argument set to a random device from
that namespace) and once per device in non-batched unregister
operations.

It allows us to factorize some device unregister work such as clearing the
routing cache.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/notifier.h |  1 +
 net/core/dev.c           | 29 +++++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 29714b8441b1..b0c3671d463c 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -202,6 +202,7 @@ static inline int notifier_to_errno(int ret)
 #define NETDEV_BONDING_OLDTYPE  0x000E
 #define NETDEV_BONDING_NEWTYPE  0x000F
 #define NETDEV_POST_INIT	0x0010
+#define NETDEV_UNREGISTER_PERNET 0x0011
 
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN
diff --git a/net/core/dev.c b/net/core/dev.c
index c3e0578d29d1..e25fe5d9343b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1344,6 +1344,7 @@ rollback:
 				nb->notifier_call(nb, NETDEV_DOWN, dev);
 			}
 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+			nb->notifier_call(nb, NETDEV_UNREGISTER_PERNET, dev);
 		}
 	}
 
@@ -4721,7 +4722,8 @@ static void net_set_todo(struct net_device *dev)
 
 static void rollback_registered_many(struct list_head *head)
 {
-	struct net_device *dev;
+	struct net_device *dev, *aux, *fdev;
+	LIST_HEAD(pernet_list);
 
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
@@ -4779,8 +4781,24 @@ static void rollback_registered_many(struct list_head *head)
 
 	synchronize_net();
 
-	list_for_each_entry(dev, head, unreg_list)
+	list_for_each_entry_safe(dev, aux, head, unreg_list) {
+		int new_net = 1;
+		list_for_each_entry(fdev, &pernet_list, unreg_list) {
+			if (dev_net(dev) == dev_net(fdev)) {
+				new_net = 0;
+				dev_put(dev);
+				break;
+			}
+		}
+		if (new_net)
+			list_move(&dev->unreg_list, &pernet_list);
+	}
+
+	list_for_each_entry_safe(dev, aux, &pernet_list, unreg_list) {
+		call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev);
+		list_move(&dev->unreg_list, head);
 		dev_put(dev);
+	}
 }
 
 static void rollback_registered(struct net_device *dev)
@@ -5074,6 +5092,8 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 			/* Rebroadcast unregister notification */
 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+			/* don't resend NETDEV_UNREGISTER_PERNET, _PERNET users
+			 * should have already handle it the first time */
 
 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
 				     &dev->state)) {
@@ -5385,6 +5405,10 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
  *	unregister_netdevice_many - unregister many devices
  *	@head: list of devices
  *
+ *	WARNING: Calling this modifies the given list
+ *	(in rollback_registered_many). It may change the order of the elements
+ *	in the list. However, you can assume it does not add or delete elements
+ *	to/from the list.
  */
 void unregister_netdevice_many(struct list_head *head)
 {
@@ -5504,6 +5528,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	   this device. They should clean all the things.
 	*/
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+	call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev);
 
 	/*
 	 *	Flush the unicast and multicast chains
-- 
cgit v1.2.3


From e014debecd3ee3832e6476b3a9c948edfcfd1250 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 17 Nov 2009 05:59:21 +0000
Subject: linkwatch: linkwatch_forget_dev() to speedup device dismantle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Herbert Xu a écrit :
> On Tue, Nov 17, 2009 at 04:26:04AM -0800, David Miller wrote:
>> Really, the link watch stuff is just due for a redesign.  I don't
>> think a simple hack is going to cut it this time, sorry Eric :-)
>
> I have no objections against any redesigns, but since the only
> caller of linkwatch_forget_dev runs in process context with the
> RTNL, it could also legally emit those events.

Thanks guys, here an updated version then, before linkwatch surgery ?

In this version, I force the event to be sent synchronously.

[PATCH net-next-2.6] linkwatch: linkwatch_forget_dev() to speedup device dismantle

time ip link del eth3.103 ; time ip link del eth3.104 ; time ip link del eth3.105

real	0m0.266s
user	0m0.000s
sys	0m0.001s

real	0m0.770s
user	0m0.000s
sys	0m0.000s

real	0m1.022s
user	0m0.000s
sys	0m0.000s

One problem of current schem in vlan dismantle phase is the
holding of device done by following chain :

vlan_dev_stop() ->
	netif_carrier_off(dev) ->
		linkwatch_fire_event(dev) ->
			dev_hold() ...

And __linkwatch_run_queue() runs up to one second later...

A generic fix to this problem is to add a linkwatch_forget_dev() method
to unlink the device from the list of watched devices.

dev->link_watch_next becomes dev->link_watch_list (and use a bit more memory),
to be able to unlink device in O(1).

After patch :
time ip link del eth3.103 ; time ip link del eth3.104 ; time ip link del eth3.105

real    0m0.024s
user    0m0.000s
sys     0m0.000s

real    0m0.032s
user    0m0.000s
sys     0m0.001s

real    0m0.033s
user    0m0.000s
sys     0m0.000s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +-
 net/core/dev.c            |  3 ++
 net/core/link_watch.c     | 94 ++++++++++++++++++++++++++++-------------------
 3 files changed, 62 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c8fa4627de00..97873e31661c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -896,7 +896,7 @@ struct net_device {
 	/* device index hash chain */
 	struct hlist_node	index_hlist;
 
-	struct net_device	*link_watch_next;
+	struct list_head	link_watch_list;
 
 	/* register/unregister state machine */
 	enum { NETREG_UNINITIALIZED=0,
@@ -1600,6 +1600,7 @@ static inline void dev_hold(struct net_device *dev)
  */
 
 extern void linkwatch_fire_event(struct net_device *dev);
+extern void linkwatch_forget_dev(struct net_device *dev);
 
 /**
  *	netif_carrier_ok - test if carrier present
diff --git a/net/core/dev.c b/net/core/dev.c
index e25fe5d9343b..c128af708ebf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5085,6 +5085,8 @@ static void netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
 
+	linkwatch_forget_dev(dev);
+
 	rebroadcast_time = warning_time = jiffies;
 	while (atomic_read(&dev->refcnt) != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
@@ -5311,6 +5313,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
+	INIT_LIST_HEAD(&dev->link_watch_list);
 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 	setup(dev);
 	strcpy(dev->name, name);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index bf8f7af699d7..5910b555a54a 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -35,7 +35,7 @@ static unsigned long linkwatch_nextevent;
 static void linkwatch_event(struct work_struct *dummy);
 static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
 
-static struct net_device *lweventlist;
+static LIST_HEAD(lweventlist);
 static DEFINE_SPINLOCK(lweventlist_lock);
 
 static unsigned char default_operstate(const struct net_device *dev)
@@ -89,8 +89,10 @@ static void linkwatch_add_event(struct net_device *dev)
 	unsigned long flags;
 
 	spin_lock_irqsave(&lweventlist_lock, flags);
-	dev->link_watch_next = lweventlist;
-	lweventlist = dev;
+	if (list_empty(&dev->link_watch_list)) {
+		list_add_tail(&dev->link_watch_list, &lweventlist);
+		dev_hold(dev);
+	}
 	spin_unlock_irqrestore(&lweventlist_lock, flags);
 }
 
@@ -133,9 +135,35 @@ static void linkwatch_schedule_work(int urgent)
 }
 
 
+static void linkwatch_do_dev(struct net_device *dev)
+{
+	/*
+	 * Make sure the above read is complete since it can be
+	 * rewritten as soon as we clear the bit below.
+	 */
+	smp_mb__before_clear_bit();
+
+	/* We are about to handle this device,
+	 * so new events can be accepted
+	 */
+	clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
+
+	rfc2863_policy(dev);
+	if (dev->flags & IFF_UP) {
+		if (netif_carrier_ok(dev))
+			dev_activate(dev);
+		else
+			dev_deactivate(dev);
+
+		netdev_state_change(dev);
+	}
+	dev_put(dev);
+}
+
 static void __linkwatch_run_queue(int urgent_only)
 {
-	struct net_device *next;
+	struct net_device *dev;
+	LIST_HEAD(wrk);
 
 	/*
 	 * Limit the number of linkwatch events to one
@@ -153,46 +181,40 @@ static void __linkwatch_run_queue(int urgent_only)
 	clear_bit(LW_URGENT, &linkwatch_flags);
 
 	spin_lock_irq(&lweventlist_lock);
-	next = lweventlist;
-	lweventlist = NULL;
-	spin_unlock_irq(&lweventlist_lock);
+	list_splice_init(&lweventlist, &wrk);
 
-	while (next) {
-		struct net_device *dev = next;
+	while (!list_empty(&wrk)) {
 
-		next = dev->link_watch_next;
+		dev = list_first_entry(&wrk, struct net_device, link_watch_list);
+		list_del_init(&dev->link_watch_list);
 
 		if (urgent_only && !linkwatch_urgent_event(dev)) {
-			linkwatch_add_event(dev);
+			list_add_tail(&dev->link_watch_list, &lweventlist);
 			continue;
 		}
-
-		/*
-		 * Make sure the above read is complete since it can be
-		 * rewritten as soon as we clear the bit below.
-		 */
-		smp_mb__before_clear_bit();
-
-		/* We are about to handle this device,
-		 * so new events can be accepted
-		 */
-		clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
-
-		rfc2863_policy(dev);
-		if (dev->flags & IFF_UP) {
-			if (netif_carrier_ok(dev))
-				dev_activate(dev);
-			else
-				dev_deactivate(dev);
-
-			netdev_state_change(dev);
-		}
-
-		dev_put(dev);
+		spin_unlock_irq(&lweventlist_lock);
+		linkwatch_do_dev(dev);
+		spin_lock_irq(&lweventlist_lock);
 	}
 
-	if (lweventlist)
+	if (!list_empty(&lweventlist))
 		linkwatch_schedule_work(0);
+	spin_unlock_irq(&lweventlist_lock);
+}
+
+void linkwatch_forget_dev(struct net_device *dev)
+{
+	unsigned long flags;
+	int clean = 0;
+
+	spin_lock_irqsave(&lweventlist_lock, flags);
+	if (!list_empty(&dev->link_watch_list)) {
+		list_del_init(&dev->link_watch_list);
+		clean = 1;
+	}
+	spin_unlock_irqrestore(&lweventlist_lock, flags);
+	if (clean)
+		linkwatch_do_dev(dev);
 }
 
 
@@ -216,8 +238,6 @@ void linkwatch_fire_event(struct net_device *dev)
 	bool urgent = linkwatch_urgent_event(dev);
 
 	if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
-		dev_hold(dev);
-
 		linkwatch_add_event(dev);
 	} else if (!urgent)
 		return;
-- 
cgit v1.2.3


From 2ea6dec4a22a6f66f6633876212fd4d195cf8277 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 17 Nov 2009 14:27:27 -0800
Subject: generic-ipi: Add smp_call_function_any()

Andrew points out that acpi-cpufreq uses cpumask_any, when it really
would prefer to use the same CPU if possible (to avoid an IPI).  In
general, this seems a good idea to offer.

[ tglx: Documented selection preference and Inlined the UP case to
  	avoid the copy of smp_call_function_single() and the extra
  	EXPORT ]

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Zhao Yakui <yakui.zhao@intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/smp.h | 11 ++++++++++-
 kernel/smp.c        | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 39c64bae776d..7a0570e6a596 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -76,6 +76,9 @@ void smp_call_function_many(const struct cpumask *mask,
 void __smp_call_function_single(int cpuid, struct call_single_data *data,
 				int wait);
 
+int smp_call_function_any(const struct cpumask *mask,
+			  void (*func)(void *info), void *info, int wait);
+
 /*
  * Generic and arch helpers
  */
@@ -137,9 +140,15 @@ static inline void smp_send_reschedule(int cpu) { }
 #define smp_prepare_boot_cpu()			do {} while (0)
 #define smp_call_function_many(mask, func, info, wait) \
 			(up_smp_call_function(func, info))
-static inline void init_call_single_data(void)
+static inline void init_call_single_data(void) { }
+
+static inline int
+smp_call_function_any(const struct cpumask *mask, void (*func)(void *info),
+		      void *info, int wait)
 {
+	return smp_call_function_single(0, func, info, wait);
 }
+
 #endif /* !SMP */
 
 /*
diff --git a/kernel/smp.c b/kernel/smp.c
index 8bd618f0364d..a8c76069cf50 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -319,6 +319,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
+/*
+ * smp_call_function_any - Run a function on any of the given cpus
+ * @mask: The mask of cpus it can run on.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait until function has completed.
+ *
+ * Returns 0 on success, else a negative status code (if no cpus were online).
+ * Note that @wait will be implicitly turned on in case of allocation failures,
+ * since we fall back to on-stack allocation.
+ *
+ * Selection preference:
+ *	1) current cpu if in @mask
+ *	2) any cpu of current node if in @mask
+ *	3) any other online cpu in @mask
+ */
+int smp_call_function_any(const struct cpumask *mask,
+			  void (*func)(void *info), void *info, int wait)
+{
+	unsigned int cpu;
+	const struct cpumask *nodemask;
+	int ret;
+
+	/* Try for same CPU (cheapest) */
+	cpu = get_cpu();
+	if (cpumask_test_cpu(cpu, mask))
+		goto call;
+
+	/* Try for same node. */
+	nodemask = cpumask_of_node(cpu);
+	for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
+	     cpu = cpumask_next_and(cpu, nodemask, mask)) {
+		if (cpu_online(cpu))
+			goto call;
+	}
+
+	/* Any online will do: smp_call_function_single handles nr_cpu_ids. */
+	cpu = cpumask_any_and(mask, cpu_online_mask);
+call:
+	ret = smp_call_function_single(cpu, func, info, wait);
+	put_cpu();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_function_any);
+
 /**
  * __smp_call_function_single(): Run a function on another CPU
  * @cpu: The CPU to run on.
-- 
cgit v1.2.3


From 60a0a52df149286a25fddf9b2d0cfe77cf0bc516 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 16 Nov 2009 02:30:16 -0800
Subject: sysctl: kill dead ctl_handler definitions.

When removing the sysctl strategy routines I overlooked their definitions
in sysctl.h.    So remove those unnecessary definitions now.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/sysctl.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4e40442777cf..b4f6adc6a02c 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -972,10 +972,6 @@ extern int sysctl_perm(struct ctl_table_root *root,
 
 typedef struct ctl_table ctl_table;
 
-typedef int ctl_handler (struct ctl_table *table,
-			 void __user *oldval, size_t __user *oldlenp,
-			 void __user *newval, size_t newlen);
-
 typedef int proc_handler (struct ctl_table *ctl, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos);
 
@@ -996,13 +992,6 @@ extern int proc_doulongvec_minmax(struct ctl_table *, int,
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 				      void __user *, size_t *, loff_t *);
 
-extern ctl_handler sysctl_data;
-extern ctl_handler sysctl_string;
-extern ctl_handler sysctl_intvec;
-extern ctl_handler sysctl_jiffies;
-extern ctl_handler sysctl_ms_jiffies;
-
-
 /*
  * Register a set of sysctl names by calling register_sysctl_table
  * with an initialised array of struct ctl_table's.  An entry with 
-- 
cgit v1.2.3


From 86926d0096279b9739ceeff40f68d3c33b9119a9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 16 Nov 2009 02:40:01 -0800
Subject: sysctl: Remove CTL_NONE and CTL_UNNUMBERED

Now that the sysctl structures no longer have a ctl_name field
there is no reason to retain the definitions for CTL_NONE and
CTL_UNNUMBERED, or to explain their historic usage.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 Documentation/sysctl/ctl_unnumbered.txt | 22 ----------------------
 include/linux/sysctl.h                  |  9 ---------
 2 files changed, 31 deletions(-)
 delete mode 100644 Documentation/sysctl/ctl_unnumbered.txt

(limited to 'include/linux')

diff --git a/Documentation/sysctl/ctl_unnumbered.txt b/Documentation/sysctl/ctl_unnumbered.txt
deleted file mode 100644
index 23003a8ea3e7..000000000000
--- a/Documentation/sysctl/ctl_unnumbered.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-
-Except for a few extremely rare exceptions user space applications do not use
-the binary sysctl interface.  Instead everyone uses /proc/sys/...  with
-readable ascii names.
-
-Recently the kernel has started supporting setting the binary sysctl value to
-CTL_UNNUMBERED so we no longer need to assign a binary sysctl path to allow
-sysctls to show up in /proc/sys.
-
-Assigning binary sysctl numbers is an endless source of conflicts in sysctl.h,
-breaking of the user space ABI (because of those conflicts), and maintenance
-problems.  A complete pass through all of the sysctl users revealed multiple
-instances where the sysctl binary interface was broken and had gone undetected
-for years.
-
-So please do not add new binary sysctl numbers.  They are unneeded and
-problematic.
-
-If you really need a new binary sysctl number please first merge your sysctl
-into the kernel and then as a separate patch allocate a binary sysctl number.
-
-(ebiederm@xmission.com, June 2007)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index b4f6adc6a02c..c83a86a22381 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -15,9 +15,6 @@
  **  The kernel will then return -ENOTDIR to any application using
  **  the old binary interface.
  **
- **  For new interfaces unless you really need a binary number
- **  please use CTL_UNNUMBERED.
- **
  ****************************************************************
  ****************************************************************
  */
@@ -50,12 +47,6 @@ struct __sysctl_args {
 
 /* Top-level names: */
 
-/* For internal pattern-matching use only: */
-#ifdef __KERNEL__
-#define CTL_NONE	0
-#define CTL_UNNUMBERED	CTL_NONE	/* sysctl without a binary number */
-#endif
-
 enum
 {
 	CTL_KERN=1,		/* General kernel info and control */
-- 
cgit v1.2.3


From c95cf3d09adc9afe7816a13a920b6df36062a3fe Mon Sep 17 00:00:00 2001
From: David-John Willis <John.Willis@Distant-earth.com>
Date: Tue, 17 Nov 2009 18:50:09 +0200
Subject: wl1251: add NVS in EEPROM support

wl1251 supports also that NVS is stored in a separate EEPROM, add support
for that.

kvalo: use platform data instead Kconfig and use kernel style

Signed-off-by: David-John Willis <John.Willis@Distant-earth.com>
Signed-off-by: Kalle Valo <kalle.valo@nokia.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/wl12xx/wl1251.h      |  1 +
 drivers/net/wireless/wl12xx/wl1251_boot.c | 20 +++++++++++++-------
 drivers/net/wireless/wl12xx/wl1251_reg.h  |  6 ++++++
 drivers/net/wireless/wl12xx/wl1251_spi.c  |  2 ++
 include/linux/spi/wl12xx.h                |  1 +
 5 files changed, 23 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/wl12xx/wl1251.h b/drivers/net/wireless/wl12xx/wl1251.h
index a839466664f0..054533f7a124 100644
--- a/drivers/net/wireless/wl12xx/wl1251.h
+++ b/drivers/net/wireless/wl12xx/wl1251.h
@@ -269,6 +269,7 @@ struct wl1251 {
 
 	void (*set_power)(bool enable);
 	int irq;
+	bool use_eeprom;
 
 	enum wl1251_state state;
 	struct mutex mutex;
diff --git a/drivers/net/wireless/wl12xx/wl1251_boot.c b/drivers/net/wireless/wl12xx/wl1251_boot.c
index 5094f24ad034..2e733e7bdfd4 100644
--- a/drivers/net/wireless/wl12xx/wl1251_boot.c
+++ b/drivers/net/wireless/wl12xx/wl1251_boot.c
@@ -494,13 +494,19 @@ int wl1251_boot(struct wl1251 *wl)
 		goto out;
 
 	/* 2. start processing NVS file */
-	ret = wl1251_boot_upload_nvs(wl);
-	if (ret < 0)
-		goto out;
-
-	/* write firmware's last address (ie. it's length) to
-	 * ACX_EEPROMLESS_IND_REG */
-	wl1251_reg_write32(wl, ACX_EEPROMLESS_IND_REG, wl->fw_len);
+	if (wl->use_eeprom) {
+		wl1251_reg_write32(wl, ACX_REG_EE_START, START_EEPROM_MGR);
+		msleep(4000);
+		wl1251_reg_write32(wl, ACX_EEPROMLESS_IND_REG, USE_EEPROM);
+	} else {
+		ret = wl1251_boot_upload_nvs(wl);
+		if (ret < 0)
+			goto out;
+
+		/* write firmware's last address (ie. it's length) to
+		 * ACX_EEPROMLESS_IND_REG */
+		wl1251_reg_write32(wl, ACX_EEPROMLESS_IND_REG, wl->fw_len);
+	}
 
 	/* 6. read the EEPROM parameters */
 	tmp = wl1251_reg_read32(wl, SCR_PAD2);
diff --git a/drivers/net/wireless/wl12xx/wl1251_reg.h b/drivers/net/wireless/wl12xx/wl1251_reg.h
index 06e1bd94a739..0ca3b4326056 100644
--- a/drivers/net/wireless/wl12xx/wl1251_reg.h
+++ b/drivers/net/wireless/wl12xx/wl1251_reg.h
@@ -370,6 +370,7 @@ enum wl12xx_acx_int_reg {
  EEPROM location specified in the EE_ADDR register.
  The Wlan hardware hardware clears this bit automatically.
 *===============================================*/
+#define EE_CTL                              (REGISTERS_BASE + 0x2000)
 #define ACX_EE_CTL_REG                      EE_CTL
 #define EE_WRITE                            0x00000001ul
 #define EE_READ                             0x00000002ul
@@ -380,6 +381,7 @@ enum wl12xx_acx_int_reg {
   This register specifies the address
   within the EEPROM from/to which to read/write data.
   ===============================================*/
+#define EE_ADDR                             (REGISTERS_BASE + 0x2008)
 #define ACX_EE_ADDR_REG                     EE_ADDR
 
 /*===============================================
@@ -389,8 +391,12 @@ enum wl12xx_acx_int_reg {
   data from the EEPROM or the write data
   to be written to the EEPROM.
   ===============================================*/
+#define EE_DATA                             (REGISTERS_BASE + 0x2004)
 #define ACX_EE_DATA_REG                     EE_DATA
 
+#define EEPROM_ACCESS_TO                    10000   /* timeout counter */
+#define START_EEPROM_MGR                    0x00000001
+
 /*===============================================
   EEPROM Base Address  - 32bit RW
   ------------------------------------------
diff --git a/drivers/net/wireless/wl12xx/wl1251_spi.c b/drivers/net/wireless/wl12xx/wl1251_spi.c
index 2cf8a2169d43..9cc8c323830f 100644
--- a/drivers/net/wireless/wl12xx/wl1251_spi.c
+++ b/drivers/net/wireless/wl12xx/wl1251_spi.c
@@ -270,6 +270,8 @@ static int __devinit wl1251_spi_probe(struct spi_device *spi)
 		return -ENODEV;
 	}
 
+	wl->use_eeprom = pdata->use_eeprom;
+
 	ret = request_irq(wl->irq, wl1251_irq, 0, DRIVER_NAME, wl);
 	if (ret < 0) {
 		wl1251_error("request_irq() failed: %d", ret);
diff --git a/include/linux/spi/wl12xx.h b/include/linux/spi/wl12xx.h
index 11430cab2aad..aed64ed3dc8a 100644
--- a/include/linux/spi/wl12xx.h
+++ b/include/linux/spi/wl12xx.h
@@ -26,6 +26,7 @@
 
 struct wl12xx_platform_data {
 	void (*set_power)(bool enable);
+	bool use_eeprom;
 };
 
 #endif
-- 
cgit v1.2.3


From 0878c3504f92f1bf063d0890a9960d4b9e6c4618 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Wed, 18 Nov 2009 16:48:00 +0100
Subject: rfkill: Add missing description for RFKILL_TYPE_GPS

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Janakiram Sistla <janakiram.sistla@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/rfkill.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index 3392c59d2706..a75e9e566ce5 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -35,6 +35,7 @@
  * @RFKILL_TYPE_UWB: switch is on a ultra wideband device.
  * @RFKILL_TYPE_WIMAX: switch is on a WiMAX device.
  * @RFKILL_TYPE_WWAN: switch is on a wireless WAN device.
+ * @RFKILL_TYPE_GPS: switch is on a GPS device.
  * @NUM_RFKILL_TYPES: number of defined rfkill types
  */
 enum rfkill_type {
-- 
cgit v1.2.3


From 875405a7793e9c35fab33819e7e5df7a98b6064c Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Wed, 18 Nov 2009 16:48:01 +0100
Subject: rfkill: Add constant for RFKILL_TYPE_FM radio devices

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Janakiram Sistla <janakiram.sistla@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/rfkill.h | 2 ++
 net/rfkill/core.c      | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index a75e9e566ce5..97059d08a626 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -36,6 +36,7 @@
  * @RFKILL_TYPE_WIMAX: switch is on a WiMAX device.
  * @RFKILL_TYPE_WWAN: switch is on a wireless WAN device.
  * @RFKILL_TYPE_GPS: switch is on a GPS device.
+ * @RFKILL_TYPE_FM: switch is on a FM radio device.
  * @NUM_RFKILL_TYPES: number of defined rfkill types
  */
 enum rfkill_type {
@@ -46,6 +47,7 @@ enum rfkill_type {
 	RFKILL_TYPE_WIMAX,
 	RFKILL_TYPE_WWAN,
 	RFKILL_TYPE_GPS,
+	RFKILL_TYPE_FM,
 	NUM_RFKILL_TYPES,
 };
 
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index ba2efb960c60..09f4e161799b 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -592,11 +592,13 @@ static const char *rfkill_get_type_str(enum rfkill_type type)
 		return "wwan";
 	case RFKILL_TYPE_GPS:
 		return "gps";
+	case RFKILL_TYPE_FM:
+		return "fm";
 	default:
 		BUG();
 	}
 
-	BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_GPS + 1);
+	BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_FM + 1);
 }
 
 static ssize_t rfkill_type_show(struct device *dev,
-- 
cgit v1.2.3


From 136cfa28615ccce0f9374811480e0b81c4191ea5 Mon Sep 17 00:00:00 2001
From: Rui Paulo <rpaulo@gmail.com>
Date: Wed, 18 Nov 2009 18:40:00 +0000
Subject: mac80211: use a structure to hold the mesh config information element

Signed-off-by: Rui Paulo <rpaulo@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h  | 16 +++++++++++++++-
 net/mac80211/ieee80211_i.h |  3 +--
 net/mac80211/mesh.c        | 23 ++++++++---------------
 net/mac80211/util.c        |  4 ++--
 net/wireless/scan.c        |  9 +++++----
 5 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 49b1abd2fe97..afa8e0ac27a7 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -115,7 +115,6 @@
 #define IEEE80211_MAX_SSID_LEN		32
 
 #define IEEE80211_MAX_MESH_ID_LEN	32
-#define IEEE80211_MESH_CONFIG_LEN	7
 
 #define IEEE80211_QOS_CTL_LEN		2
 #define IEEE80211_QOS_CTL_TID_MASK	0x000F
@@ -554,6 +553,21 @@ struct ieee80211_tim_ie {
 	u8 virtual_map[1];
 } __attribute__ ((packed));
 
+/**
+ * struct ieee80211_meshconf_ie
+ *
+ * This structure refers to "Mesh Configuration information element"
+ */
+struct ieee80211_meshconf_ie {
+	u8 meshconf_psel;
+	u8 meshconf_pmetric;
+	u8 meshconf_congest;
+	u8 meshconf_synch;
+	u8 meshconf_auth;
+	u8 meshconf_form;
+	u8 meshconf_cap;
+} __attribute__ ((packed));
+
 /**
  * struct ieee80211_rann_ie
  *
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 74e79935de0a..87d27f450a05 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -798,7 +798,7 @@ struct ieee802_11_elems {
 	u8 *wmm_param;
 	struct ieee80211_ht_cap *ht_cap_elem;
 	struct ieee80211_ht_info *ht_info_elem;
-	u8 *mesh_config;
+	struct ieee80211_meshconf_ie *mesh_config;
 	u8 *mesh_id;
 	u8 *peer_link;
 	u8 *preq;
@@ -826,7 +826,6 @@ struct ieee802_11_elems {
 	u8 ext_supp_rates_len;
 	u8 wmm_info_len;
 	u8 wmm_param_len;
-	u8 mesh_config_len;
 	u8 mesh_id_len;
 	u8 peer_link_len;
 	u8 preq_len;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 3a0683ba357b..51adb1115215 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -16,12 +16,6 @@
 #define IEEE80211_MESH_HOUSEKEEPING_INTERVAL (60 * HZ)
 #define IEEE80211_MESH_RANN_INTERVAL	     (1 * HZ)
 
-#define MESHCONF_PP_OFFSET 	0		/* Path Selection Protocol */
-#define MESHCONF_PM_OFFSET	1		/* Path Selection Metric   */
-#define MESHCONF_CC_OFFSET	2		/* Congestion Control Mode */
-#define MESHCONF_SP_OFFSET	3		/* Synchronization Protocol */
-#define MESHCONF_AUTH_OFFSET	4		/* Authentication Protocol */
-#define MESHCONF_CAPAB_OFFSET 	6
 #define MESHCONF_CAPAB_ACCEPT_PLINKS 0x01
 #define MESHCONF_CAPAB_FORWARDING    0x08
 
@@ -87,12 +81,11 @@ bool mesh_matches_local(struct ieee802_11_elems *ie, struct ieee80211_sub_if_dat
 	 */
 	if (ifmsh->mesh_id_len == ie->mesh_id_len &&
 		memcmp(ifmsh->mesh_id, ie->mesh_id, ie->mesh_id_len) == 0 &&
-		(ifmsh->mesh_pp_id == *(ie->mesh_config + MESHCONF_PP_OFFSET))&&
-		(ifmsh->mesh_pm_id == *(ie->mesh_config + MESHCONF_PM_OFFSET))&&
-		(ifmsh->mesh_cc_id == *(ie->mesh_config + MESHCONF_CC_OFFSET))&&
-		(ifmsh->mesh_sp_id == *(ie->mesh_config + MESHCONF_SP_OFFSET))&&
-		(ifmsh->mesh_auth_id == *(ie->mesh_config +
-		    MESHCONF_AUTH_OFFSET)))
+		(ifmsh->mesh_pp_id == ie->mesh_config->meshconf_psel) &&
+		(ifmsh->mesh_pm_id == ie->mesh_config->meshconf_pmetric) &&
+		(ifmsh->mesh_cc_id == ie->mesh_config->meshconf_congest) &&
+		(ifmsh->mesh_sp_id == ie->mesh_config->meshconf_synch) &&
+		(ifmsh->mesh_auth_id == ie->mesh_config->meshconf_auth))
 		return true;
 
 	return false;
@@ -105,7 +98,7 @@ bool mesh_matches_local(struct ieee802_11_elems *ie, struct ieee80211_sub_if_dat
  */
 bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie)
 {
-	return (*(ie->mesh_config + MESHCONF_CAPAB_OFFSET) &
+	return (ie->mesh_config->meshconf_cap &
 	    MESHCONF_CAPAB_ACCEPT_PLINKS) != 0;
 }
 
@@ -262,9 +255,9 @@ void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata)
 	if (sdata->u.mesh.mesh_id_len)
 		memcpy(pos, sdata->u.mesh.mesh_id, sdata->u.mesh.mesh_id_len);
 
-	pos = skb_put(skb, 2 + IEEE80211_MESH_CONFIG_LEN);
+	pos = skb_put(skb, 2 + sizeof(struct ieee80211_meshconf_ie));
 	*pos++ = WLAN_EID_MESH_CONFIG;
-	*pos++ = IEEE80211_MESH_CONFIG_LEN;
+	*pos++ = sizeof(struct ieee80211_meshconf_ie);
 
 	/* Active path selection protocol ID */
 	*pos++ = sdata->u.mesh.mesh_pp_id;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 5ae1bf389849..2fb0432ac830 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -666,8 +666,8 @@ u32 ieee802_11_parse_elems_crc(u8 *start, size_t len,
 			elems->mesh_id_len = elen;
 			break;
 		case WLAN_EID_MESH_CONFIG:
-			elems->mesh_config = pos;
-			elems->mesh_config_len = elen;
+			if (elen >= sizeof(struct ieee80211_meshconf_ie))
+				elems->mesh_config = (void *)pos;
 			break;
 		case WLAN_EID_PEER_LINK:
 			elems->peer_link = pos;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index e2d344ff6745..d03447d68cdd 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -217,7 +217,7 @@ static bool is_mesh(struct cfg80211_bss *a,
 		     a->len_information_elements);
 	if (!ie)
 		return false;
-	if (ie[1] != IEEE80211_MESH_CONFIG_LEN)
+	if (ie[1] != sizeof(struct ieee80211_meshconf_ie))
 		return false;
 
 	/*
@@ -225,7 +225,8 @@ static bool is_mesh(struct cfg80211_bss *a,
 	 * comparing since that may differ between stations taking
 	 * part in the same mesh.
 	 */
-	return memcmp(ie + 2, meshcfg, IEEE80211_MESH_CONFIG_LEN - 2) == 0;
+	return memcmp(ie + 2, meshcfg,
+	    sizeof(struct ieee80211_meshconf_ie) - 2) == 0;
 }
 
 static int cmp_bss(struct cfg80211_bss *a,
@@ -399,7 +400,7 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev,
 				  res->pub.information_elements,
 				  res->pub.len_information_elements);
 		if (!meshid || !meshcfg ||
-		    meshcfg[1] != IEEE80211_MESH_CONFIG_LEN) {
+		    meshcfg[1] != sizeof(struct ieee80211_meshconf_ie)) {
 			/* bogus mesh */
 			kref_put(&res->ref, bss_release);
 			return NULL;
@@ -865,7 +866,7 @@ ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info,
 			break;
 		case WLAN_EID_MESH_CONFIG:
 			ismesh = true;
-			if (ie[1] != IEEE80211_MESH_CONFIG_LEN)
+			if (ie[1] != sizeof(struct ieee80211_meshconf_ie))
 				break;
 			buf = kmalloc(50, GFP_ATOMIC);
 			if (!buf)
-- 
cgit v1.2.3


From 386e50cc7d82b3799ea6f53267f04f123ae05afe Mon Sep 17 00:00:00 2001
From: Andrew Hendry <andrew.hendry@gmail.com>
Date: Wed, 18 Nov 2009 23:30:41 -0800
Subject: X25: Enable setting of cause and diagnostic fields

Adds SIOCX25SCAUSEDIAG, allowing X.25 programs to set the cause and
diagnostic fields.

Normally used to indicate status upon closing connections.

Signed-off-by: Andrew Hendry <andrew.hendry@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/x25.h |  1 +
 net/x25/af_x25.c    | 12 ++++++++++++
 net/x25/x25_subr.c  |  6 ++++++
 3 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/x25.h b/include/linux/x25.h
index d035e4e87d07..6450a7f12074 100644
--- a/include/linux/x25.h
+++ b/include/linux/x25.h
@@ -25,6 +25,7 @@
 #define SIOCX25SENDCALLACCPT    (SIOCPROTOPRIVATE + 9)
 #define SIOCX25GDTEFACILITIES (SIOCPROTOPRIVATE + 10)
 #define SIOCX25SDTEFACILITIES (SIOCPROTOPRIVATE + 11)
+#define SIOCX25SCAUSEDIAG	(SIOCPROTOPRIVATE + 12)
 
 /*
  *	Values for {get,set}sockopt.
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 39ce03e07d18..ac7dba46fa33 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1471,6 +1471,17 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			break;
 		}
 
+		case SIOCX25SCAUSEDIAG: {
+			struct x25_causediag causediag;
+			rc = -EFAULT;
+			if (copy_from_user(&causediag, argp, sizeof(causediag)))
+				break;
+			x25->causediag = causediag;
+			rc = 0;
+			break;
+
+		}
+
 		case SIOCX25SCUDMATCHLEN: {
 			struct x25_subaddr sub_addr;
 			rc = -EINVAL;
@@ -1639,6 +1650,7 @@ static int compat_x25_ioctl(struct socket *sock, unsigned int cmd,
 	case SIOCX25GCALLUSERDATA:
 	case SIOCX25SCALLUSERDATA:
 	case SIOCX25GCAUSEDIAG:
+	case SIOCX25SCAUSEDIAG:
 	case SIOCX25SCUDMATCHLEN:
 	case SIOCX25CALLACCPTAPPRV:
 	case SIOCX25SENDCALLACCPT:
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 511a5986af3e..352b32d216fc 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -225,6 +225,12 @@ void x25_write_internal(struct sock *sk, int frametype)
 			break;
 
 		case X25_CLEAR_REQUEST:
+			dptr    = skb_put(skb, 3);
+			*dptr++ = frametype;
+			*dptr++ = x25->causediag.cause;
+			*dptr++ = x25->causediag.diagnostic;
+			break;
+
 		case X25_RESET_REQUEST:
 			dptr    = skb_put(skb, 3);
 			*dptr++ = frametype;
-- 
cgit v1.2.3


From ad4bb6f8883a13bb0f65b194dae36c62a02ac779 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 19 Nov 2009 00:56:30 +0100
Subject: cfg80211: disallow bridging managed/adhoc interfaces

A number of people have tried to add a wireless interface
(in managed mode) to a bridge and then complained that it
doesn't work. It cannot work, however, because in 802.11
networks all packets need to be acknowledged and as such
need to be sent to the right address. Promiscuous doesn't
help here. The wireless address format used for these
links has only space for three addresses, the
 * transmitter, which must be equal to the sender (origin)
 * receiver (on the wireless medium), which is the AP in
   the case of managed mode
 * the recipient (destination), which is on the APs local
   network segment

In an IBSS, it is similar, but the receiver and recipient
must match and the third address is used as the BSSID.

To avoid such mistakes in the future, disallow adding a
wireless interface to a bridge.

Felix has recently added a four-address mode to the AP
and client side that can be used (after negotiating that
it is possible, which must happen out-of-band by setting
up both sides) for bridging, so allow that case.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/if.h     |  1 +
 net/bridge/br_if.c     |  4 ++++
 net/wireless/core.c    |  4 ++++
 net/wireless/nl80211.c | 12 ++++++++----
 net/wireless/util.c    | 31 +++++++++++++++++++++++++++++++
 5 files changed, 48 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if.h b/include/linux/if.h
index 3b2a46bf8f8d..3a9f410a296b 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -70,6 +70,7 @@
 #define IFF_XMIT_DST_RELEASE 0x400	/* dev_hard_start_xmit() is allowed to
 					 * release skb->dst
 					 */
+#define IFF_DONT_BRIDGE 0x800		/* disallow bridging this ether dev */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index a6f74b2b9571..a2cbe61f6e65 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -390,6 +390,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (dev->br_port != NULL)
 		return -EBUSY;
 
+	/* No bridging devices that dislike that (e.g. wireless) */
+	if (dev->priv_flags & IFF_DONT_BRIDGE)
+		return -EOPNOTSUPP;
+
 	p = new_nbp(br, dev);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
diff --git a/net/wireless/core.c b/net/wireless/core.c
index e2cc6e7522dd..fc5e9b508607 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -697,6 +697,10 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
 #endif
 		if (!dev->ethtool_ops)
 			dev->ethtool_ops = &cfg80211_ethtool_ops;
+
+		if ((wdev->iftype == NL80211_IFTYPE_STATION ||
+		     wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
+			dev->priv_flags |= IFF_DONT_BRIDGE;
 		break;
 	case NETDEV_GOING_DOWN:
 		switch (wdev->iftype) {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b7b0f67b0c61..149539ade15e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -969,10 +969,14 @@ static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags)
 }
 
 static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
-			       u8 use_4addr, enum nl80211_iftype iftype)
+			       struct net_device *netdev, u8 use_4addr,
+			       enum nl80211_iftype iftype)
 {
-	if (!use_4addr)
+	if (!use_4addr) {
+		if (netdev && netdev->br_port)
+			return -EBUSY;
 		return 0;
+	}
 
 	switch (iftype) {
 	case NL80211_IFTYPE_AP_VLAN:
@@ -1033,7 +1037,7 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_4ADDR]) {
 		params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
 		change = true;
-		err = nl80211_valid_4addr(rdev, params.use_4addr, ntype);
+		err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype);
 		if (err)
 			goto unlock;
 	} else {
@@ -1111,7 +1115,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 
 	if (info->attrs[NL80211_ATTR_4ADDR]) {
 		params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
-		err = nl80211_valid_4addr(rdev, params.use_4addr, type);
+		err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type);
 		if (err)
 			goto unlock;
 	}
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 17a7a4cfc617..59361fdcb5d0 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -658,6 +658,11 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 	    !(rdev->wiphy.interface_modes & (1 << ntype)))
 		return -EOPNOTSUPP;
 
+	/* if it's part of a bridge, reject changing type to station/ibss */
+	if (dev->br_port && (ntype == NL80211_IFTYPE_ADHOC ||
+			     ntype == NL80211_IFTYPE_STATION))
+		return -EBUSY;
+
 	if (ntype != otype) {
 		dev->ieee80211_ptr->use_4addr = false;
 
@@ -687,5 +692,31 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 	if (!err && params && params->use_4addr != -1)
 		dev->ieee80211_ptr->use_4addr = params->use_4addr;
 
+	if (!err) {
+		dev->priv_flags &= ~IFF_DONT_BRIDGE;
+		switch (ntype) {
+		case NL80211_IFTYPE_STATION:
+			if (dev->ieee80211_ptr->use_4addr)
+				break;
+			/* fall through */
+		case NL80211_IFTYPE_ADHOC:
+			dev->priv_flags |= IFF_DONT_BRIDGE;
+			break;
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_AP_VLAN:
+		case NL80211_IFTYPE_WDS:
+		case NL80211_IFTYPE_MESH_POINT:
+			/* bridging OK */
+			break;
+		case NL80211_IFTYPE_MONITOR:
+			/* monitor can't bridge anyway */
+			break;
+		case NL80211_IFTYPE_UNSPECIFIED:
+		case __NL80211_IFTYPE_AFTER_LAST:
+			/* not happening */
+			break;
+		}
+	}
+
 	return err;
 }
-- 
cgit v1.2.3


From 3d7a641e544e428191667e8b1f83f96fa46dbd65 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:23 +0000
Subject: SLOW_WORK: Wait for outstanding work items belonging to a module to
 clear

Wait for outstanding slow work items belonging to a module to clear when
unregistering that module as a user of the facility.  This prevents the put_ref
code of a work item from being taken away before it returns.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  13 ++++-
 fs/fscache/main.c           |   6 +-
 fs/fscache/object.c         |   1 +
 fs/fscache/operation.c      |   1 +
 fs/gfs2/main.c              |   4 +-
 fs/gfs2/recovery.c          |   1 +
 include/linux/slow-work.h   |   8 ++-
 kernel/slow-work.c          | 132 ++++++++++++++++++++++++++++++++++++++++++--
 8 files changed, 150 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index ebc50f808ea4..f12fda31dcdc 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -64,9 +64,11 @@ USING SLOW WORK ITEMS
 Firstly, a module or subsystem wanting to make use of slow work items must
 register its interest:
 
-	 int ret = slow_work_register_user();
+	 int ret = slow_work_register_user(struct module *module);
 
-This will return 0 if successful, or a -ve error upon failure.
+This will return 0 if successful, or a -ve error upon failure.  The module
+pointer should be the module interested in using this facility (almost
+certainly THIS_MODULE).
 
 
 Slow work items may then be set up by:
@@ -110,7 +112,12 @@ operation.  When all a module's slow work items have been processed, and the
 module has no further interest in the facility, it should unregister its
 interest:
 
-	slow_work_unregister_user();
+	slow_work_unregister_user(struct module *module);
+
+The module pointer is used to wait for all outstanding work items for that
+module before completing the unregistration.  This prevents the put_ref() code
+from being taken away before it completes.  module should almost certainly be
+THIS_MODULE.
 
 
 ===============
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 4de41b597499..add6bdb53f04 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -48,7 +48,7 @@ static int __init fscache_init(void)
 {
 	int ret;
 
-	ret = slow_work_register_user();
+	ret = slow_work_register_user(THIS_MODULE);
 	if (ret < 0)
 		goto error_slow_work;
 
@@ -80,7 +80,7 @@ error_kobj:
 error_cookie_jar:
 	fscache_proc_cleanup();
 error_proc:
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 error_slow_work:
 	return ret;
 }
@@ -97,7 +97,7 @@ static void __exit fscache_exit(void)
 	kobject_put(fscache_root);
 	kmem_cache_destroy(fscache_cookie_jar);
 	fscache_proc_cleanup();
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
 
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 392a41b1b79d..d236eb1d6f37 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -45,6 +45,7 @@ static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
 
 const struct slow_work_ops fscache_object_slow_work_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= fscache_object_slow_work_get_ref,
 	.put_ref	= fscache_object_slow_work_put_ref,
 	.execute	= fscache_object_slow_work_execute,
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e7f8d53b8b6b..f1a2857b2ff5 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -453,6 +453,7 @@ static void fscache_op_execute(struct slow_work *work)
 }
 
 const struct slow_work_ops fscache_op_slow_work_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= fscache_op_get_ref,
 	.put_ref	= fscache_op_put_ref,
 	.execute	= fscache_op_execute,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index eacd78a5d082..5b31f7741a8f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,7 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		goto fail_unregister;
 
-	error = slow_work_register_user();
+	error = slow_work_register_user(THIS_MODULE);
 	if (error)
 		goto fail_slow;
 
@@ -163,7 +163,7 @@ static void __exit exit_gfs2_fs(void)
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 
 	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 59d2695509d3..b2bb779f09ed 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -593,6 +593,7 @@ fail:
 }
 
 struct slow_work_ops gfs2_recover_ops = {
+	.owner	 = THIS_MODULE,
 	.get_ref = gfs2_recover_get_ref,
 	.put_ref = gfs2_recover_put_ref,
 	.execute = gfs2_recover_work,
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index b65c8881f07a..9adb2b30754f 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -24,6 +24,9 @@ struct slow_work;
  * The operations used to support slow work items
  */
 struct slow_work_ops {
+	/* owner */
+	struct module *owner;
+
 	/* get a ref on a work item
 	 * - return 0 if successful, -ve if not
 	 */
@@ -42,6 +45,7 @@ struct slow_work_ops {
  *   queued
  */
 struct slow_work {
+	struct module		*owner;	/* the owning module */
 	unsigned long		flags;
 #define SLOW_WORK_PENDING	0	/* item pending (further) execution */
 #define SLOW_WORK_EXECUTING	1	/* item currently executing */
@@ -84,8 +88,8 @@ static inline void vslow_work_init(struct slow_work *work,
 }
 
 extern int slow_work_enqueue(struct slow_work *work);
-extern int slow_work_register_user(void);
-extern void slow_work_unregister_user(void);
+extern int slow_work_register_user(struct module *owner);
+extern void slow_work_unregister_user(struct module *owner);
 
 #ifdef CONFIG_SYSCTL
 extern ctl_table slow_work_sysctls[];
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 0d31135efbf4..dd08f376e406 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -22,6 +22,8 @@
 #define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
 					 * OOM */
 
+#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
+
 static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
 
@@ -46,7 +48,7 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
 
 #ifdef CONFIG_SYSCTL
 static const int slow_work_min_min_threads = 2;
-static int slow_work_max_max_threads = 255;
+static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
 static const int slow_work_min_vslow = 1;
 static const int slow_work_max_vslow = 99;
 
@@ -97,6 +99,23 @@ static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
 static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
 static struct slow_work slow_work_new_thread; /* new thread starter */
 
+/*
+ * slow work ID allocation (use slow_work_queue_lock)
+ */
+static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
+
+/*
+ * Unregistration tracking to prevent put_ref() from disappearing during module
+ * unload
+ */
+#ifdef CONFIG_MODULES
+static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
+static struct module *slow_work_unreg_module;
+static struct slow_work *slow_work_unreg_work_item;
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
+static DEFINE_MUTEX(slow_work_unreg_sync_lock);
+#endif
+
 /*
  * The queues of work items and the lock governing access to them.  These are
  * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
@@ -149,8 +168,11 @@ static unsigned slow_work_calc_vsmax(void)
  * Attempt to execute stuff queued on a slow thread.  Return true if we managed
  * it, false if there was nothing to do.
  */
-static bool slow_work_execute(void)
+static bool slow_work_execute(int id)
 {
+#ifdef CONFIG_MODULES
+	struct module *module;
+#endif
 	struct slow_work *work = NULL;
 	unsigned vsmax;
 	bool very_slow;
@@ -186,6 +208,12 @@ static bool slow_work_execute(void)
 	} else {
 		very_slow = false; /* avoid the compiler warning */
 	}
+
+#ifdef CONFIG_MODULES
+	if (work)
+		slow_work_thread_processing[id] = work->owner;
+#endif
+
 	spin_unlock_irq(&slow_work_queue_lock);
 
 	if (!work)
@@ -219,7 +247,18 @@ static bool slow_work_execute(void)
 		spin_unlock_irq(&slow_work_queue_lock);
 	}
 
+	/* sort out the race between module unloading and put_ref() */
 	work->ops->put_ref(work);
+
+#ifdef CONFIG_MODULES
+	module = slow_work_thread_processing[id];
+	slow_work_thread_processing[id] = NULL;
+	smp_mb();
+	if (slow_work_unreg_work_item == work ||
+	    slow_work_unreg_module == module)
+		wake_up_all(&slow_work_unreg_wq);
+#endif
+
 	return true;
 
 auto_requeue:
@@ -232,6 +271,7 @@ auto_requeue:
 	else
 		list_add_tail(&work->link, &slow_work_queue);
 	spin_unlock_irq(&slow_work_queue_lock);
+	slow_work_thread_processing[id] = NULL;
 	return true;
 }
 
@@ -368,13 +408,22 @@ static inline bool slow_work_available(int vsmax)
  */
 static int slow_work_thread(void *_data)
 {
-	int vsmax;
+	int vsmax, id;
 
 	DEFINE_WAIT(wait);
 
 	set_freezable();
 	set_user_nice(current, -5);
 
+	/* allocate ourselves an ID */
+	spin_lock_irq(&slow_work_queue_lock);
+	id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
+	BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
+	__set_bit(id, slow_work_ids);
+	spin_unlock_irq(&slow_work_queue_lock);
+
+	sprintf(current->comm, "kslowd%03u", id);
+
 	for (;;) {
 		vsmax = vslow_work_proportion;
 		vsmax *= atomic_read(&slow_work_thread_count);
@@ -395,7 +444,7 @@ static int slow_work_thread(void *_data)
 		vsmax *= atomic_read(&slow_work_thread_count);
 		vsmax /= 100;
 
-		if (slow_work_available(vsmax) && slow_work_execute()) {
+		if (slow_work_available(vsmax) && slow_work_execute(id)) {
 			cond_resched();
 			if (list_empty(&slow_work_queue) &&
 			    list_empty(&vslow_work_queue) &&
@@ -412,6 +461,10 @@ static int slow_work_thread(void *_data)
 			break;
 	}
 
+	spin_lock_irq(&slow_work_queue_lock);
+	__clear_bit(id, slow_work_ids);
+	spin_unlock_irq(&slow_work_queue_lock);
+
 	if (atomic_dec_and_test(&slow_work_thread_count))
 		complete_and_exit(&slow_work_last_thread_exited, 0);
 	return 0;
@@ -475,6 +528,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 }
 
 static const struct slow_work_ops slow_work_new_thread_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= slow_work_new_thread_get_ref,
 	.put_ref	= slow_work_new_thread_put_ref,
 	.execute	= slow_work_new_thread_execute,
@@ -546,12 +600,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
 
 /**
  * slow_work_register_user - Register a user of the facility
+ * @module: The module about to make use of the facility
  *
  * Register a user of the facility, starting up the initial threads if there
  * aren't any other users at this point.  This will return 0 if successful, or
  * an error if not.
  */
-int slow_work_register_user(void)
+int slow_work_register_user(struct module *module)
 {
 	struct task_struct *p;
 	int loop;
@@ -598,14 +653,79 @@ error:
 }
 EXPORT_SYMBOL(slow_work_register_user);
 
+/*
+ * wait for all outstanding items from the calling module to complete
+ * - note that more items may be queued whilst we're waiting
+ */
+static void slow_work_wait_for_items(struct module *module)
+{
+	DECLARE_WAITQUEUE(myself, current);
+	struct slow_work *work;
+	int loop;
+
+	mutex_lock(&slow_work_unreg_sync_lock);
+	add_wait_queue(&slow_work_unreg_wq, &myself);
+
+	for (;;) {
+		spin_lock_irq(&slow_work_queue_lock);
+
+		/* first of all, we wait for the last queued item in each list
+		 * to be processed */
+		list_for_each_entry_reverse(work, &vslow_work_queue, link) {
+			if (work->owner == module) {
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				slow_work_unreg_work_item = work;
+				goto do_wait;
+			}
+		}
+		list_for_each_entry_reverse(work, &slow_work_queue, link) {
+			if (work->owner == module) {
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				slow_work_unreg_work_item = work;
+				goto do_wait;
+			}
+		}
+
+		/* then we wait for the items being processed to finish */
+		slow_work_unreg_module = module;
+		smp_mb();
+		for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
+			if (slow_work_thread_processing[loop] == module)
+				goto do_wait;
+		}
+		spin_unlock_irq(&slow_work_queue_lock);
+		break; /* okay, we're done */
+
+	do_wait:
+		spin_unlock_irq(&slow_work_queue_lock);
+		schedule();
+		slow_work_unreg_work_item = NULL;
+		slow_work_unreg_module = NULL;
+	}
+
+	remove_wait_queue(&slow_work_unreg_wq, &myself);
+	mutex_unlock(&slow_work_unreg_sync_lock);
+}
+
 /**
  * slow_work_unregister_user - Unregister a user of the facility
+ * @module: The module whose items should be cleared
  *
  * Unregister a user of the facility, killing all the threads if this was the
  * last one.
+ *
+ * This waits for all the work items belonging to the nominated module to go
+ * away before proceeding.
  */
-void slow_work_unregister_user(void)
+void slow_work_unregister_user(struct module *module)
 {
+	/* first of all, wait for all outstanding items from the calling module
+	 * to complete */
+	if (module)
+		slow_work_wait_for_items(module);
+
+	/* then we can actually go about shutting down the facility if need
+	 * be */
 	mutex_lock(&slow_work_user_lock);
 
 	BUG_ON(slow_work_user_count <= 0);
-- 
cgit v1.2.3


From 0160950297c08f8233c89b9f9e7dd59cfb080809 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Nov 2009 18:10:43 +0000
Subject: SLOW_WORK: Add support for cancellation of slow work

Add support for cancellation of queued slow work and delayed slow work items.
The cancellation functions will wait for items that are pending or undergoing
execution to be discarded by the slow work facility.

Attempting to enqueue work that is in the process of being cancelled will
result in ECANCELED.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt | 12 ++++++-
 include/linux/slow-work.h   |  2 ++
 kernel/slow-work.c          | 81 +++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 88 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index c655c517fc68..2e384bd4dead 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -108,7 +108,17 @@ on the item, 0 otherwise.
 
 
 The items are reference counted, so there ought to be no need for a flush
-operation.  When all a module's slow work items have been processed, and the
+operation.  But as the reference counting is optional, means to cancel
+existing work items are also included:
+
+	cancel_slow_work(&myitem);
+
+can be used to cancel pending work.  The above cancel function waits for
+existing work to have been executed (or prevent execution of them, depending
+on timing).
+
+
+When all a module's slow work items have been processed, and the
 module has no further interest in the facility, it should unregister its
 interest:
 
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index 9adb2b30754f..eef20182d5b4 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -51,6 +51,7 @@ struct slow_work {
 #define SLOW_WORK_EXECUTING	1	/* item currently executing */
 #define SLOW_WORK_ENQ_DEFERRED	2	/* item enqueue deferred */
 #define SLOW_WORK_VERY_SLOW	3	/* item is very slow */
+#define SLOW_WORK_CANCELLING	4	/* item is being cancelled, don't enqueue */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
 };
@@ -88,6 +89,7 @@ static inline void vslow_work_init(struct slow_work *work,
 }
 
 extern int slow_work_enqueue(struct slow_work *work);
+extern void slow_work_cancel(struct slow_work *work);
 extern int slow_work_register_user(struct module *owner);
 extern void slow_work_unregister_user(struct module *owner);
 
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index fccf421eb5c1..671cc434532a 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -236,12 +236,17 @@ static bool slow_work_execute(int id)
 	if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
 		BUG();
 
-	work->ops->execute(work);
+	/* don't execute if the work is in the process of being cancelled */
+	if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
+		work->ops->execute(work);
 
 	if (very_slow)
 		atomic_dec(&vslow_work_executing_count);
 	clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
 
+	/* wake up anyone waiting for this work to be complete */
+	wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
+
 	/* if someone tried to enqueue the item whilst we were executing it,
 	 * then it'll be left unenqueued to avoid multiple threads trying to
 	 * execute it simultaneously
@@ -314,11 +319,16 @@ auto_requeue:
  * allowed to pick items to execute.  This ensures that very slow items won't
  * overly block ones that are just ordinarily slow.
  *
- * Returns 0 if successful, -EAGAIN if not.
+ * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
+ * attempted queued)
  */
 int slow_work_enqueue(struct slow_work *work)
 {
 	unsigned long flags;
+	int ret;
+
+	if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
+		return -ECANCELED;
 
 	BUG_ON(slow_work_user_count <= 0);
 	BUG_ON(!work);
@@ -335,6 +345,9 @@ int slow_work_enqueue(struct slow_work *work)
 	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
 		spin_lock_irqsave(&slow_work_queue_lock, flags);
 
+		if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
+			goto cancelled;
+
 		/* we promise that we will not attempt to execute the work
 		 * function in more than one thread simultaneously
 		 *
@@ -352,8 +365,9 @@ int slow_work_enqueue(struct slow_work *work)
 		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
 			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
 		} else {
-			if (slow_work_get_ref(work) < 0)
-				goto cant_get_ref;
+			ret = slow_work_get_ref(work);
+			if (ret < 0)
+				goto failed;
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
 			else
@@ -365,12 +379,67 @@ int slow_work_enqueue(struct slow_work *work)
 	}
 	return 0;
 
-cant_get_ref:
+cancelled:
+	ret = -ECANCELED;
+failed:
 	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
-	return -EAGAIN;
+	return ret;
 }
 EXPORT_SYMBOL(slow_work_enqueue);
 
+static int slow_work_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
+/**
+ * slow_work_cancel - Cancel a slow work item
+ * @work: The work item to cancel
+ *
+ * This function will cancel a previously enqueued work item. If we cannot
+ * cancel the work item, it is guarenteed to have run when this function
+ * returns.
+ */
+void slow_work_cancel(struct slow_work *work)
+{
+	bool wait = true, put = false;
+
+	set_bit(SLOW_WORK_CANCELLING, &work->flags);
+
+	spin_lock_irq(&slow_work_queue_lock);
+
+	if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
+	    !list_empty(&work->link)) {
+		/* the link in the pending queue holds a reference on the item
+		 * that we will need to release */
+		list_del_init(&work->link);
+		wait = false;
+		put = true;
+		clear_bit(SLOW_WORK_PENDING, &work->flags);
+
+	} else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
+		/* the executor is holding our only reference on the item, so
+		 * we merely need to wait for it to finish executing */
+		clear_bit(SLOW_WORK_PENDING, &work->flags);
+	}
+
+	spin_unlock_irq(&slow_work_queue_lock);
+
+	/* the EXECUTING flag is set by the executor whilst the spinlock is set
+	 * and before the item is dequeued - so assuming the above doesn't
+	 * actually dequeue it, simply waiting for the EXECUTING flag to be
+	 * released here should be sufficient */
+	if (wait)
+		wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
+			    TASK_UNINTERRUPTIBLE);
+
+	clear_bit(SLOW_WORK_CANCELLING, &work->flags);
+	if (put)
+		slow_work_put_ref(work);
+}
+EXPORT_SYMBOL(slow_work_cancel);
+
 /*
  * Schedule a cull of the thread pool at some time in the near future
  */
-- 
cgit v1.2.3


From 6b8268b17a1ffc942bc72d7d00274e433d6b6719 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Nov 2009 18:10:47 +0000
Subject: SLOW_WORK: Add delayed_slow_work support

This adds support for starting slow work with a delay, similar
to the functionality we have for workqueues.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  16 +++++-
 include/linux/slow-work.h   |  29 ++++++++++
 kernel/slow-work.c          | 129 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 171 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index 2e384bd4dead..a9d1b0ffdded 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -41,6 +41,13 @@ expand files, provided the time taken to do so isn't too long.
 Operations of both types may sleep during execution, thus tying up the thread
 loaned to it.
 
+A further class of work item is available, based on the slow work item class:
+
+ (*) Delayed slow work items.
+
+These are slow work items that have a timer to defer queueing of the item for
+a while.
+
 
 THREAD-TO-CLASS ALLOCATION
 --------------------------
@@ -93,6 +100,10 @@ Slow work items may then be set up by:
 
 	slow_work_init(&myitem, &myitem_ops);
 
+     or:
+
+	delayed_slow_work_init(&myitem, &myitem_ops);
+
      or:
 
 	vslow_work_init(&myitem, &myitem_ops);
@@ -104,7 +115,9 @@ A suitably set up work item can then be enqueued for processing:
 	int ret = slow_work_enqueue(&myitem);
 
 This will return a -ve error if the thread pool is unable to gain a reference
-on the item, 0 otherwise.
+on the item, 0 otherwise, or (for delayed work):
+
+	int ret = delayed_slow_work_enqueue(&myitem, my_jiffy_delay);
 
 
 The items are reference counted, so there ought to be no need for a flush
@@ -112,6 +125,7 @@ operation.  But as the reference counting is optional, means to cancel
 existing work items are also included:
 
 	cancel_slow_work(&myitem);
+	cancel_delayed_slow_work(&myitem);
 
 can be used to cancel pending work.  The above cancel function waits for
 existing work to have been executed (or prevent execution of them, depending
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index eef20182d5b4..b245b9a9cc0b 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -17,6 +17,7 @@
 #ifdef CONFIG_SLOW_WORK
 
 #include <linux/sysctl.h>
+#include <linux/timer.h>
 
 struct slow_work;
 
@@ -52,10 +53,16 @@ struct slow_work {
 #define SLOW_WORK_ENQ_DEFERRED	2	/* item enqueue deferred */
 #define SLOW_WORK_VERY_SLOW	3	/* item is very slow */
 #define SLOW_WORK_CANCELLING	4	/* item is being cancelled, don't enqueue */
+#define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
 };
 
+struct delayed_slow_work {
+	struct slow_work	work;
+	struct timer_list	timer;
+};
+
 /**
  * slow_work_init - Initialise a slow work item
  * @work: The work item to initialise
@@ -71,6 +78,20 @@ static inline void slow_work_init(struct slow_work *work,
 	INIT_LIST_HEAD(&work->link);
 }
 
+/**
+ * slow_work_init - Initialise a delayed slow work item
+ * @work: The work item to initialise
+ * @ops: The operations to use to handle the slow work item
+ *
+ * Initialise a delayed slow work item.
+ */
+static inline void delayed_slow_work_init(struct delayed_slow_work *dwork,
+					  const struct slow_work_ops *ops)
+{
+	init_timer(&dwork->timer);
+	slow_work_init(&dwork->work, ops);
+}
+
 /**
  * vslow_work_init - Initialise a very slow work item
  * @work: The work item to initialise
@@ -93,6 +114,14 @@ extern void slow_work_cancel(struct slow_work *work);
 extern int slow_work_register_user(struct module *owner);
 extern void slow_work_unregister_user(struct module *owner);
 
+extern int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
+				     unsigned long delay);
+
+static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork)
+{
+	slow_work_cancel(&dwork->work);
+}
+
 #ifdef CONFIG_SYSCTL
 extern ctl_table slow_work_sysctls[];
 #endif
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 671cc434532a..f67e1daae93d 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -406,11 +406,40 @@ void slow_work_cancel(struct slow_work *work)
 	bool wait = true, put = false;
 
 	set_bit(SLOW_WORK_CANCELLING, &work->flags);
+	smp_mb();
+
+	/* if the work item is a delayed work item with an active timer, we
+	 * need to wait for the timer to finish _before_ getting the spinlock,
+	 * lest we deadlock against the timer routine
+	 *
+	 * the timer routine will leave DELAYED set if it notices the
+	 * CANCELLING flag in time
+	 */
+	if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
+		struct delayed_slow_work *dwork =
+			container_of(work, struct delayed_slow_work, work);
+		del_timer_sync(&dwork->timer);
+	}
 
 	spin_lock_irq(&slow_work_queue_lock);
 
-	if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
-	    !list_empty(&work->link)) {
+	if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
+		/* the timer routine aborted or never happened, so we are left
+		 * holding the timer's reference on the item and should just
+		 * drop the pending flag and wait for any ongoing execution to
+		 * finish */
+		struct delayed_slow_work *dwork =
+			container_of(work, struct delayed_slow_work, work);
+
+		BUG_ON(timer_pending(&dwork->timer));
+		BUG_ON(!list_empty(&work->link));
+
+		clear_bit(SLOW_WORK_DELAYED, &work->flags);
+		put = true;
+		clear_bit(SLOW_WORK_PENDING, &work->flags);
+
+	} else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
+		   !list_empty(&work->link)) {
 		/* the link in the pending queue holds a reference on the item
 		 * that we will need to release */
 		list_del_init(&work->link);
@@ -440,6 +469,102 @@ void slow_work_cancel(struct slow_work *work)
 }
 EXPORT_SYMBOL(slow_work_cancel);
 
+/*
+ * Handle expiry of the delay timer, indicating that a delayed slow work item
+ * should now be queued if not cancelled
+ */
+static void delayed_slow_work_timer(unsigned long data)
+{
+	struct slow_work *work = (struct slow_work *) data;
+	unsigned long flags;
+	bool queued = false, put = false;
+
+	spin_lock_irqsave(&slow_work_queue_lock, flags);
+	if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
+		clear_bit(SLOW_WORK_DELAYED, &work->flags);
+
+		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
+			/* we discard the reference the timer was holding in
+			 * favour of the one the executor holds */
+			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
+			put = true;
+		} else {
+			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+				list_add_tail(&work->link, &vslow_work_queue);
+			else
+				list_add_tail(&work->link, &slow_work_queue);
+			queued = true;
+		}
+	}
+
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	if (put)
+		slow_work_put_ref(work);
+	if (queued)
+		wake_up(&slow_work_thread_wq);
+}
+
+/**
+ * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
+ * @dwork: The delayed work item to queue
+ * @delay: When to start executing the work, in jiffies from now
+ *
+ * This is similar to slow_work_enqueue(), but it adds a delay before the work
+ * is actually queued for processing.
+ *
+ * The item can have delayed processing requested on it whilst it is being
+ * executed.  The delay will begin immediately, and if it expires before the
+ * item finishes executing, the item will be placed back on the queue when it
+ * has done executing.
+ */
+int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
+			      unsigned long delay)
+{
+	struct slow_work *work = &dwork->work;
+	unsigned long flags;
+	int ret;
+
+	if (delay == 0)
+		return slow_work_enqueue(&dwork->work);
+
+	BUG_ON(slow_work_user_count <= 0);
+	BUG_ON(!work);
+	BUG_ON(!work->ops);
+
+	if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
+		return -ECANCELED;
+
+	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+		spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+		if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
+			goto cancelled;
+
+		/* the timer holds a reference whilst it is pending */
+		ret = work->ops->get_ref(work);
+		if (ret < 0)
+			goto cant_get_ref;
+
+		if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
+			BUG();
+		dwork->timer.expires = jiffies + delay;
+		dwork->timer.data = (unsigned long) work;
+		dwork->timer.function = delayed_slow_work_timer;
+		add_timer(&dwork->timer);
+
+		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	}
+
+	return 0;
+
+cancelled:
+	ret = -ECANCELED;
+cant_get_ref:
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(delayed_slow_work_enqueue);
+
 /*
  * Schedule a cull of the thread pool at some time in the near future
  */
-- 
cgit v1.2.3


From 8fba10a42d191de612e60e7009c8f0313f90a9b3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:51 +0000
Subject: SLOW_WORK: Allow the work items to be viewed through a /proc file

Allow the executing and queued work items to be viewed through a /proc file
for debugging purposes.  The contents look something like the following:

    THR PID   ITEM ADDR        FL MARK  DESC
    === ===== ================ == ===== ==========
      0  3005 ffff880023f52348  a 952ms FSC: OBJ17d3: LOOK
      1  3006 ffff880024e33668  2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2
      2  3165 ffff8800296dd180  a 424ms FSC: OBJ17e4: LOOK
      3  4089 ffff8800262c8d78  a 212ms FSC: OBJ17ea: CRTN
      4  4090 ffff88002792bed8  2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2
      5  4092 ffff88002a0ef308  2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2
      6  4094 ffff88002abaf4b8  2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2
      7  4095 ffff88002bb188e0  a 388ms FSC: OBJ17e9: CRTN
    vsq     - ffff880023d99668  1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2
    vsq     - ffff8800295d1740  1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2
    vsq     - ffff880025ba3308  1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2
    vsq     - ffff880024ec83e0  1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2
    vsq     - ffff880026618e00  1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2
    vsq     - ffff880025a2a4b8  1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2
    vsq     - ffff880023cbe6d8  9 212ms FSC: OBJ17eb: LOOK
    vsq     - ffff880024d37590  9 212ms FSC: OBJ17ec: LOOK
    vsq     - ffff880027746cb0  9 212ms FSC: OBJ17ed: LOOK
    vsq     - ffff880024d37ae8  9 212ms FSC: OBJ17ee: LOOK
    vsq     - ffff880024d37cb0  9 212ms FSC: OBJ17ef: LOOK
    vsq     - ffff880025036550  9 212ms FSC: OBJ17f0: LOOK
    vsq     - ffff8800250368e0  9 212ms FSC: OBJ17f1: LOOK
    vsq     - ffff880025036aa8  9 212ms FSC: OBJ17f2: LOOK

In the 'THR' column, executing items show the thread they're occupying and
queued threads indicate which queue they're on.  'PID' shows the process ID of
a slow-work thread that's executing something.  'FL' shows the work item flags.
'MARK' indicates how long since an item was queued or began executing.  Lastly,
the 'DESC' column permits the owner of an item to give some information.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  60 +++++++++++-
 include/linux/slow-work.h   |  11 +++
 init/Kconfig                |  10 ++
 kernel/Makefile             |   1 +
 kernel/slow-work-proc.c     | 227 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/slow-work.c          |  44 ++++++---
 kernel/slow-work.h          |  72 ++++++++++++++
 7 files changed, 413 insertions(+), 12 deletions(-)
 create mode 100644 kernel/slow-work-proc.c
 create mode 100644 kernel/slow-work.h

(limited to 'include/linux')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index a9d1b0ffdded..f120238e70fe 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -149,7 +149,8 @@ ITEM OPERATIONS
 ===============
 
 Each work item requires a table of operations of type struct slow_work_ops.
-Only ->execute() is required, getting and putting of a reference are optional.
+Only ->execute() is required; the getting and putting of a reference and the
+describing of an item are all optional.
 
  (*) Get a reference on an item:
 
@@ -179,6 +180,16 @@ Only ->execute() is required, getting and putting of a reference are optional.
      This should perform the work required of the item.  It may sleep, it may
      perform disk I/O and it may wait for locks.
 
+ (*) View an item through /proc:
+
+	void (*desc)(struct slow_work *work, struct seq_file *m);
+
+     If supplied, this should print to 'm' a small string describing the work
+     the item is to do.  This should be no more than about 40 characters, and
+     shouldn't include a newline character.
+
+     See the 'Viewing executing and queued items' section below.
+
 
 ==================
 POOL CONFIGURATION
@@ -203,3 +214,50 @@ The slow-work thread pool has a number of configurables:
      is bounded to between 1 and one fewer than the number of active threads.
      This ensures there is always at least one thread that can process very
      slow work items, and always at least one thread that won't.
+
+
+==================================
+VIEWING EXECUTING AND QUEUED ITEMS
+==================================
+
+If CONFIG_SLOW_WORK_PROC is enabled, a proc file is made available:
+
+	/proc/slow_work_rq
+
+through which the list of work items being executed and the queues of items to
+be executed may be viewed.  The owner of a work item is given the chance to
+add some information of its own.
+
+The contents look something like the following:
+
+    THR PID   ITEM ADDR        FL MARK  DESC
+    === ===== ================ == ===== ==========
+      0  3005 ffff880023f52348  a 952ms FSC: OBJ17d3: LOOK
+      1  3006 ffff880024e33668  2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2
+      2  3165 ffff8800296dd180  a 424ms FSC: OBJ17e4: LOOK
+      3  4089 ffff8800262c8d78  a 212ms FSC: OBJ17ea: CRTN
+      4  4090 ffff88002792bed8  2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2
+      5  4092 ffff88002a0ef308  2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2
+      6  4094 ffff88002abaf4b8  2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2
+      7  4095 ffff88002bb188e0  a 388ms FSC: OBJ17e9: CRTN
+    vsq     - ffff880023d99668  1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2
+    vsq     - ffff8800295d1740  1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2
+    vsq     - ffff880025ba3308  1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2
+    vsq     - ffff880024ec83e0  1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2
+    vsq     - ffff880026618e00  1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2
+    vsq     - ffff880025a2a4b8  1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2
+    vsq     - ffff880023cbe6d8  9 212ms FSC: OBJ17eb: LOOK
+    vsq     - ffff880024d37590  9 212ms FSC: OBJ17ec: LOOK
+    vsq     - ffff880027746cb0  9 212ms FSC: OBJ17ed: LOOK
+    vsq     - ffff880024d37ae8  9 212ms FSC: OBJ17ee: LOOK
+    vsq     - ffff880024d37cb0  9 212ms FSC: OBJ17ef: LOOK
+    vsq     - ffff880025036550  9 212ms FSC: OBJ17f0: LOOK
+    vsq     - ffff8800250368e0  9 212ms FSC: OBJ17f1: LOOK
+    vsq     - ffff880025036aa8  9 212ms FSC: OBJ17f2: LOOK
+
+In the 'THR' column, executing items show the thread they're occupying and
+queued threads indicate which queue they're on.  'PID' shows the process ID of
+a slow-work thread that's executing something.  'FL' shows the work item flags.
+'MARK' indicates how long since an item was queued or began executing.  Lastly,
+the 'DESC' column permits the owner of an item to give some information.
+
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index b245b9a9cc0b..f41485145ed1 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -20,6 +20,9 @@
 #include <linux/timer.h>
 
 struct slow_work;
+#ifdef CONFIG_SLOW_WORK_PROC
+struct seq_file;
+#endif
 
 /*
  * The operations used to support slow work items
@@ -38,6 +41,11 @@ struct slow_work_ops {
 
 	/* execute a work item */
 	void (*execute)(struct slow_work *work);
+
+#ifdef CONFIG_SLOW_WORK_PROC
+	/* describe a work item for /proc */
+	void (*desc)(struct slow_work *work, struct seq_file *m);
+#endif
 };
 
 /*
@@ -56,6 +64,9 @@ struct slow_work {
 #define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
+#ifdef CONFIG_SLOW_WORK_PROC
+	struct timespec		mark;	/* jiffies at which queued or exec begun */
+#endif
 };
 
 struct delayed_slow_work {
diff --git a/init/Kconfig b/init/Kconfig
index 9e03ef8b311e..ab5c64801fe5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1098,6 +1098,16 @@ config SLOW_WORK
 
 	  See Documentation/slow-work.txt.
 
+config SLOW_WORK_PROC
+	bool "Slow work debugging through /proc"
+	default n
+	depends on SLOW_WORK && PROC_FS
+	help
+	  Display the contents of the slow work run queue through /proc,
+	  including items currently executing.
+
+	  See Documentation/slow-work.txt.
+
 endmenu		# General setup
 
 config HAVE_GENERIC_DMA_COHERENT
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b9..776ffed1556d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_SLOW_WORK_PROC) += slow-work-proc.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/slow-work-proc.c b/kernel/slow-work-proc.c
new file mode 100644
index 000000000000..3988032571f5
--- /dev/null
+++ b/kernel/slow-work-proc.c
@@ -0,0 +1,227 @@
+/* Slow work debugging
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/seq_file.h>
+#include "slow-work.h"
+
+#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
+#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
+#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
+
+void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
+{
+	seq_puts(m, "Slow-work: New thread");
+}
+
+/*
+ * Render the time mark field on a work item into a 5-char time with units plus
+ * a space
+ */
+static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
+{
+	struct timespec now, diff;
+
+	now = CURRENT_TIME;
+	diff = timespec_sub(now, work->mark);
+
+	if (diff.tv_sec < 0)
+		seq_puts(m, "  -ve ");
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
+		seq_printf(m, "%3luns ", diff.tv_nsec);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
+		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
+		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
+	else if (diff.tv_sec <= 1)
+		seq_puts(m, "   1s ");
+	else if (diff.tv_sec < 60)
+		seq_printf(m, "%4lus ", diff.tv_sec);
+	else if (diff.tv_sec < 60 * 60)
+		seq_printf(m, "%4lum ", diff.tv_sec / 60);
+	else if (diff.tv_sec < 60 * 60 * 24)
+		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
+	else
+		seq_puts(m, "exces ");
+}
+
+/*
+ * Describe a slow work item for /proc
+ */
+static int slow_work_runqueue_show(struct seq_file *m, void *v)
+{
+	struct slow_work *work;
+	struct list_head *p = v;
+	unsigned long id;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
+		return 0;
+	case 2:
+		seq_puts(m, "=== ===== ================ == ===== ==========\n");
+		return 0;
+
+	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
+		id = (unsigned long) v - 3;
+
+		read_lock(&slow_work_execs_lock);
+		work = slow_work_execs[id];
+		if (work) {
+			smp_read_barrier_depends();
+
+			seq_printf(m, "%3lu %5d %16p %2lx ",
+				   id, slow_work_pids[id], work, work->flags);
+			slow_work_print_mark(m, work);
+
+			if (work->ops->desc)
+				work->ops->desc(work, m);
+			seq_putc(m, '\n');
+		}
+		read_unlock(&slow_work_execs_lock);
+		return 0;
+
+	default:
+		work = list_entry(p, struct slow_work, link);
+		seq_printf(m, "%3s     - %16p %2lx ",
+			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
+			   work, work->flags);
+		slow_work_print_mark(m, work);
+
+		if (work->ops->desc)
+			work->ops->desc(work, m);
+		seq_putc(m, '\n');
+		return 0;
+	}
+}
+
+/*
+ * map the iterator to a work item
+ */
+static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
+{
+	struct list_head *p;
+	unsigned long count, id;
+
+	switch (*_pos >> ITERATOR_SHIFT) {
+	case 0x0:
+		if (*_pos == 0)
+			*_pos = 1;
+		if (*_pos < 3)
+			return (void *)(unsigned long) *_pos;
+		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
+			for (id = *_pos - 3;
+			     id < SLOW_WORK_THREAD_LIMIT;
+			     id++, (*_pos)++)
+				if (slow_work_execs[id])
+					return (void *)(unsigned long) *_pos;
+		*_pos = 0x1UL << ITERATOR_SHIFT;
+
+	case 0x1:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &slow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+
+	case 0x2:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &vslow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
+{
+	spin_lock_irq(&slow_work_queue_lock);
+	return slow_work_runqueue_index(m, _pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	struct list_head *p = v;
+	unsigned long selector = *_pos >> ITERATOR_SHIFT;
+
+	(*_pos)++;
+	switch (selector) {
+	case 0x0:
+		return slow_work_runqueue_index(m, _pos);
+
+	case 0x1:
+		if (*_pos >> ITERATOR_SHIFT == 0x1) {
+			p = p->next;
+			if (p != &slow_work_queue)
+				return p;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+		p = &vslow_work_queue;
+
+	case 0x2:
+		if (*_pos >> ITERATOR_SHIFT == 0x2) {
+			p = p->next;
+			if (p != &vslow_work_queue)
+				return p;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * clean up after reading
+ */
+static void slow_work_runqueue_stop(struct seq_file *m, void *v)
+{
+	spin_unlock_irq(&slow_work_queue_lock);
+}
+
+static const struct seq_operations slow_work_runqueue_ops = {
+	.start		= slow_work_runqueue_start,
+	.stop		= slow_work_runqueue_stop,
+	.next		= slow_work_runqueue_next,
+	.show		= slow_work_runqueue_show,
+};
+
+/*
+ * open "/proc/slow_work_rq" to list queue contents
+ */
+static int slow_work_runqueue_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slow_work_runqueue_ops);
+}
+
+const struct file_operations slow_work_runqueue_fops = {
+	.owner		= THIS_MODULE,
+	.open		= slow_work_runqueue_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index f67e1daae93d..b763bc2d2670 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,13 +16,8 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/wait.h>
-
-#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
-					 * things to do */
-#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
-					 * OOM */
-
-#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
+#include <linux/proc_fs.h>
+#include "slow-work.h"
 
 static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
@@ -116,6 +111,15 @@ static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
 static DEFINE_MUTEX(slow_work_unreg_sync_lock);
 #endif
 
+/*
+ * Data for tracking currently executing items for indication through /proc
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
+pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
+DEFINE_RWLOCK(slow_work_execs_lock);
+#endif
+
 /*
  * The queues of work items and the lock governing access to them.  These are
  * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
@@ -124,9 +128,9 @@ static DEFINE_MUTEX(slow_work_unreg_sync_lock);
  * There are two queues of work items: one for slow work items, and one for
  * very slow work items.
  */
-static LIST_HEAD(slow_work_queue);
-static LIST_HEAD(vslow_work_queue);
-static DEFINE_SPINLOCK(slow_work_queue_lock);
+LIST_HEAD(slow_work_queue);
+LIST_HEAD(vslow_work_queue);
+DEFINE_SPINLOCK(slow_work_queue_lock);
 
 /*
  * The thread controls.  A variable used to signal to the threads that they
@@ -182,7 +186,7 @@ static unsigned slow_work_calc_vsmax(void)
  * Attempt to execute stuff queued on a slow thread.  Return true if we managed
  * it, false if there was nothing to do.
  */
-static bool slow_work_execute(int id)
+static noinline bool slow_work_execute(int id)
 {
 #ifdef CONFIG_MODULES
 	struct module *module;
@@ -227,6 +231,10 @@ static bool slow_work_execute(int id)
 	if (work)
 		slow_work_thread_processing[id] = work->owner;
 #endif
+	if (work) {
+		slow_work_mark_time(work);
+		slow_work_begin_exec(id, work);
+	}
 
 	spin_unlock_irq(&slow_work_queue_lock);
 
@@ -247,6 +255,8 @@ static bool slow_work_execute(int id)
 	/* wake up anyone waiting for this work to be complete */
 	wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
 
+	slow_work_end_exec(id, work);
+
 	/* if someone tried to enqueue the item whilst we were executing it,
 	 * then it'll be left unenqueued to avoid multiple threads trying to
 	 * execute it simultaneously
@@ -285,6 +295,7 @@ auto_requeue:
 	 * - we transfer our ref on the item back to the appropriate queue
 	 * - don't wake another thread up as we're awake already
 	 */
+	slow_work_mark_time(work);
 	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 		list_add_tail(&work->link, &vslow_work_queue);
 	else
@@ -368,6 +379,7 @@ int slow_work_enqueue(struct slow_work *work)
 			ret = slow_work_get_ref(work);
 			if (ret < 0)
 				goto failed;
+			slow_work_mark_time(work);
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
 			else
@@ -489,6 +501,7 @@ static void delayed_slow_work_timer(unsigned long data)
 			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
 			put = true;
 		} else {
+			slow_work_mark_time(work);
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
 			else
@@ -627,6 +640,7 @@ static int slow_work_thread(void *_data)
 	id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
 	BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
 	__set_bit(id, slow_work_ids);
+	slow_work_set_thread_pid(id, current->pid);
 	spin_unlock_irq(&slow_work_queue_lock);
 
 	sprintf(current->comm, "kslowd%03u", id);
@@ -669,6 +683,7 @@ static int slow_work_thread(void *_data)
 	}
 
 	spin_lock_irq(&slow_work_queue_lock);
+	slow_work_set_thread_pid(id, 0);
 	__clear_bit(id, slow_work_ids);
 	spin_unlock_irq(&slow_work_queue_lock);
 
@@ -722,6 +737,9 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 static const struct slow_work_ops slow_work_new_thread_ops = {
 	.owner		= THIS_MODULE,
 	.execute	= slow_work_new_thread_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+	.desc		= slow_work_new_thread_desc,
+#endif
 };
 
 /*
@@ -948,6 +966,10 @@ static int __init init_slow_work(void)
 #ifdef CONFIG_SYSCTL
 	if (slow_work_max_max_threads < nr_cpus * 2)
 		slow_work_max_max_threads = nr_cpus * 2;
+#endif
+#ifdef CONFIG_SLOW_WORK_PROC
+	proc_create("slow_work_rq", S_IFREG | 0400, NULL,
+		    &slow_work_runqueue_fops);
 #endif
 	return 0;
 }
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
new file mode 100644
index 000000000000..3c2f007f3ad6
--- /dev/null
+++ b/kernel/slow-work.h
@@ -0,0 +1,72 @@
+/* Slow work private definitions
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
+					 * things to do */
+#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
+					 * OOM */
+
+#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
+
+/*
+ * slow-work.c
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+extern struct slow_work *slow_work_execs[];
+extern pid_t slow_work_pids[];
+extern rwlock_t slow_work_execs_lock;
+#endif
+
+extern struct list_head slow_work_queue;
+extern struct list_head vslow_work_queue;
+extern spinlock_t slow_work_queue_lock;
+
+/*
+ * slow-work-proc.c
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+extern const struct file_operations slow_work_runqueue_fops;
+
+extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
+#endif
+
+/*
+ * Helper functions
+ */
+static inline void slow_work_set_thread_pid(int id, pid_t pid)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	slow_work_pids[id] = pid;
+#endif
+}
+
+static inline void slow_work_mark_time(struct slow_work *work)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	work->mark = CURRENT_TIME;
+#endif
+}
+
+static inline void slow_work_begin_exec(int id, struct slow_work *work)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	slow_work_execs[id] = work;
+#endif
+}
+
+static inline void slow_work_end_exec(int id, struct slow_work *work)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	write_lock(&slow_work_execs_lock);
+	slow_work_execs[id] = NULL;
+	write_unlock(&slow_work_execs_lock);
+#endif
+}
-- 
cgit v1.2.3


From 31ba99d304494cb28fa8671ccc769c5543e1165d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:53 +0000
Subject: SLOW_WORK: Allow the owner of a work item to determine if it is
 queued or not

Add a function (slow_work_is_queued()) to permit the owner of a work item to
determine if the item is queued or not.

The work item is counted as being queued if it is actually on the queue, not
just if it is pending.  If it is executing and pending, then it is not on the
queue, but will rather be put back on the queue when execution finishes.

This permits a caller to quickly work out if it may be able to put another,
dependent work item on the queue behind it, or whether it will have to wait
till that is finished.

This can be used by CacheFiles to work out whether the creation a new object
can be immediately deferred when it has to wait for an old object to be
deleted, or whether a wait must take place.  If a wait is necessary, then the
slow-work thread can otherwise get blocked, preventing the deletion from
taking place.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt | 15 +++++++++++++++
 include/linux/slow-work.h   | 19 +++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index f120238e70fe..0169c9d9dd16 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -144,6 +144,21 @@ from being taken away before it completes.  module should almost certainly be
 THIS_MODULE.
 
 
+================
+HELPER FUNCTIONS
+================
+
+The slow-work facility provides a function by which it can be determined
+whether or not an item is queued for later execution:
+
+	bool queued = slow_work_is_queued(struct slow_work *work);
+
+If it returns false, then the item is not on the queue (it may be executing
+with a requeue pending).  This can be used to work out whether an item on which
+another depends is on the queue, thus allowing a dependent item to be queued
+after it.
+
+
 ===============
 ITEM OPERATIONS
 ===============
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index f41485145ed1..bfd3ab4c8898 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -120,6 +120,25 @@ static inline void vslow_work_init(struct slow_work *work,
 	INIT_LIST_HEAD(&work->link);
 }
 
+/**
+ * slow_work_is_queued - Determine if a slow work item is on the work queue
+ * work: The work item to test
+ *
+ * Determine if the specified slow-work item is on the work queue.  This
+ * returns true if it is actually on the queue.
+ *
+ * If the item is executing and has been marked for requeue when execution
+ * finishes, then false will be returned.
+ *
+ * Anyone wishing to wait for completion of execution can wait on the
+ * SLOW_WORK_EXECUTING bit.
+ */
+static inline bool slow_work_is_queued(struct slow_work *work)
+{
+	unsigned long flags = work->flags;
+	return flags & SLOW_WORK_PENDING && !(flags & SLOW_WORK_EXECUTING);
+}
+
 extern int slow_work_enqueue(struct slow_work *work);
 extern void slow_work_cancel(struct slow_work *work);
 extern int slow_work_register_user(struct module *owner);
-- 
cgit v1.2.3


From 3bde31a4ac225cb5805be02eff6eaaf7e0766ccd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:57 +0000
Subject: SLOW_WORK: Allow a requeueable work item to sleep till the thread is
 needed

Add a function to allow a requeueable work item to sleep till the thread
processing it is needed by the slow-work facility to perform other work.

Sometimes a work item can't progress immediately, but must wait for the
completion of another work item that's currently being processed by another
slow-work thread.

In some circumstances, the waiting item could instead - theoretically - put
itself back on the queue and yield its thread back to the slow-work facility,
thus waiting till it gets processing time again before attempting to progress.
This would allow other work items processing time on that thread.

However, this only works if there is something on the queue for it to queue
behind - otherwise it will just get a thread again immediately, and will end
up cycling between the queue and the thread, eating up valuable CPU time.

So, slow_work_sleep_till_thread_needed() is provided such that an item can put
itself on a wait queue that will wake it up when the event it is actually
interested in occurs, then call this function in lieu of calling schedule().

This function will then sleep until either the item's event occurs or another
work item appears on the queue.  If another work item is queued, but the
item's event hasn't occurred, then the work item should requeue itself and
yield the thread back to the slow-work facility by returning.

This can be used by CacheFiles for an object that is being created on one
thread to wait for an object being deleted on another thread where there is
nothing on the queue for the creation to go and wait behind.  As soon as an
item appears on the queue that could be given thread time instead, CacheFiles
can stick the creating object back on the queue and return to the slow-work
facility - assuming the object deletion didn't also complete.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt | 44 +++++++++++++++++++++
 include/linux/slow-work.h   |  3 ++
 kernel/slow-work.c          | 94 ++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 132 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index 0169c9d9dd16..52bc31433723 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -158,6 +158,50 @@ with a requeue pending).  This can be used to work out whether an item on which
 another depends is on the queue, thus allowing a dependent item to be queued
 after it.
 
+If the above shows an item on which another depends not to be queued, then the
+owner of the dependent item might need to wait.  However, to avoid locking up
+the threads unnecessarily be sleeping in them, it can make sense under some
+circumstances to return the work item to the queue, thus deferring it until
+some other items have had a chance to make use of the yielded thread.
+
+To yield a thread and defer an item, the work function should simply enqueue
+the work item again and return.  However, this doesn't work if there's nothing
+actually on the queue, as the thread just vacated will jump straight back into
+the item's work function, thus busy waiting on a CPU.
+
+Instead, the item should use the thread to wait for the dependency to go away,
+but rather than using schedule() or schedule_timeout() to sleep, it should use
+the following function:
+
+	bool requeue = slow_work_sleep_till_thread_needed(
+			struct slow_work *work,
+			signed long *_timeout);
+
+This will add a second wait and then sleep, such that it will be woken up if
+either something appears on the queue that could usefully make use of the
+thread - and behind which this item can be queued, or if the event the caller
+set up to wait for happens.  True will be returned if something else appeared
+on the queue and this work function should perhaps return, of false if
+something else woke it up.  The timeout is as for schedule_timeout().
+
+For example:
+
+	wq = bit_waitqueue(&my_flags, MY_BIT);
+	init_wait(&wait);
+	requeue = false;
+	do {
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (!test_bit(MY_BIT, &my_flags))
+			break;
+		requeue = slow_work_sleep_till_thread_needed(&my_work,
+							     &timeout);
+	} while (timeout > 0 && !requeue);
+	finish_wait(wq, &wait);
+	if (!test_bit(MY_BIT, &my_flags)
+		goto do_my_thing;
+	if (requeue)
+		return; // to slow_work
+
 
 ===============
 ITEM OPERATIONS
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index bfd3ab4c8898..5035a2691739 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -152,6 +152,9 @@ static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork)
 	slow_work_cancel(&dwork->work);
 }
 
+extern bool slow_work_sleep_till_thread_needed(struct slow_work *work,
+					       signed long *_timeout);
+
 #ifdef CONFIG_SYSCTL
 extern ctl_table slow_work_sysctls[];
 #endif
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b763bc2d2670..da94f3c101af 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -132,6 +132,15 @@ LIST_HEAD(slow_work_queue);
 LIST_HEAD(vslow_work_queue);
 DEFINE_SPINLOCK(slow_work_queue_lock);
 
+/*
+ * The following are two wait queues that get pinged when a work item is placed
+ * on an empty queue.  These allow work items that are hogging a thread by
+ * sleeping in a way that could be deferred to yield their thread and enqueue
+ * themselves.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
+static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
+
 /*
  * The thread controls.  A variable used to signal to the threads that they
  * should exit when the queue is empty, a waitqueue used by the threads to wait
@@ -305,6 +314,50 @@ auto_requeue:
 	return true;
 }
 
+/**
+ * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
+ * work: The work item under execution that wants to sleep
+ * _timeout: Scheduler sleep timeout
+ *
+ * Allow a requeueable work item to sleep on a slow-work processor thread until
+ * that thread is needed to do some other work or the sleep is interrupted by
+ * some other event.
+ *
+ * The caller must set up a wake up event before calling this and must have set
+ * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
+ * condition before calling this function as no test is made here.
+ *
+ * False is returned if there is nothing on the queue; true is returned if the
+ * work item should be requeued
+ */
+bool slow_work_sleep_till_thread_needed(struct slow_work *work,
+					signed long *_timeout)
+{
+	wait_queue_head_t *wfo_wq;
+	struct list_head *queue;
+
+	DEFINE_WAIT(wait);
+
+	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
+		wfo_wq = &vslow_work_queue_waits_for_occupation;
+		queue = &vslow_work_queue;
+	} else {
+		wfo_wq = &slow_work_queue_waits_for_occupation;
+		queue = &slow_work_queue;
+	}
+
+	if (!list_empty(queue))
+		return true;
+
+	add_wait_queue_exclusive(wfo_wq, &wait);
+	if (list_empty(queue))
+		*_timeout = schedule_timeout(*_timeout);
+	finish_wait(wfo_wq, &wait);
+
+	return !list_empty(queue);
+}
+EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
+
 /**
  * slow_work_enqueue - Schedule a slow work item for processing
  * @work: The work item to queue
@@ -335,6 +388,8 @@ auto_requeue:
  */
 int slow_work_enqueue(struct slow_work *work)
 {
+	wait_queue_head_t *wfo_wq;
+	struct list_head *queue;
 	unsigned long flags;
 	int ret;
 
@@ -354,6 +409,14 @@ int slow_work_enqueue(struct slow_work *work)
 	 * maintaining our promise
 	 */
 	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+		if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
+			wfo_wq = &vslow_work_queue_waits_for_occupation;
+			queue = &vslow_work_queue;
+		} else {
+			wfo_wq = &slow_work_queue_waits_for_occupation;
+			queue = &slow_work_queue;
+		}
+
 		spin_lock_irqsave(&slow_work_queue_lock, flags);
 
 		if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
@@ -380,11 +443,13 @@ int slow_work_enqueue(struct slow_work *work)
 			if (ret < 0)
 				goto failed;
 			slow_work_mark_time(work);
-			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
-				list_add_tail(&work->link, &vslow_work_queue);
-			else
-				list_add_tail(&work->link, &slow_work_queue);
+			list_add_tail(&work->link, queue);
 			wake_up(&slow_work_thread_wq);
+
+			/* if someone who could be requeued is sleeping on a
+			 * thread, then ask them to yield their thread */
+			if (work->link.prev == queue)
+				wake_up(wfo_wq);
 		}
 
 		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
@@ -487,9 +552,19 @@ EXPORT_SYMBOL(slow_work_cancel);
  */
 static void delayed_slow_work_timer(unsigned long data)
 {
+	wait_queue_head_t *wfo_wq;
+	struct list_head *queue;
 	struct slow_work *work = (struct slow_work *) data;
 	unsigned long flags;
-	bool queued = false, put = false;
+	bool queued = false, put = false, first = false;
+
+	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
+		wfo_wq = &vslow_work_queue_waits_for_occupation;
+		queue = &vslow_work_queue;
+	} else {
+		wfo_wq = &slow_work_queue_waits_for_occupation;
+		queue = &slow_work_queue;
+	}
 
 	spin_lock_irqsave(&slow_work_queue_lock, flags);
 	if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
@@ -502,17 +577,18 @@ static void delayed_slow_work_timer(unsigned long data)
 			put = true;
 		} else {
 			slow_work_mark_time(work);
-			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
-				list_add_tail(&work->link, &vslow_work_queue);
-			else
-				list_add_tail(&work->link, &slow_work_queue);
+			list_add_tail(&work->link, queue);
 			queued = true;
+			if (work->link.prev == queue)
+				first = true;
 		}
 	}
 
 	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
 	if (put)
 		slow_work_put_ref(work);
+	if (first)
+		wake_up(wfo_wq);
 	if (queued)
 		wake_up(&slow_work_thread_wq);
 }
-- 
cgit v1.2.3


From 440f0affe247e9990c8f8778f1861da4fd7d5e50 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:11:01 +0000
Subject: FS-Cache: Annotate slow-work runqueue proc lines for FS-Cache work
 items

Annotate slow-work runqueue proc lines for FS-Cache work items.  Objects
include the object ID and the state.  Operations include the object ID, the
operation ID and the operation type and state.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/fscache/object.c           | 41 ++++++++++++++++++++++++++++++++++++++++-
 fs/fscache/operation.c        | 29 +++++++++++++++++++++++++++++
 fs/fscache/page.c             | 27 ++++++++++++++++++++++-----
 include/linux/fscache-cache.h | 12 ++++++++++++
 4 files changed, 103 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index d236eb1d6f37..615b63dd9ecc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,9 +14,10 @@
 
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
-const char *fscache_object_states[] = {
+const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
 	[FSCACHE_OBJECT_INIT]		= "OBJECT_INIT",
 	[FSCACHE_OBJECT_LOOKING_UP]	= "OBJECT_LOOKING_UP",
 	[FSCACHE_OBJECT_CREATING]	= "OBJECT_CREATING",
@@ -33,9 +34,28 @@ const char *fscache_object_states[] = {
 };
 EXPORT_SYMBOL(fscache_object_states);
 
+static const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
+	[FSCACHE_OBJECT_INIT]		= "INIT",
+	[FSCACHE_OBJECT_LOOKING_UP]	= "LOOK",
+	[FSCACHE_OBJECT_CREATING]	= "CRTN",
+	[FSCACHE_OBJECT_AVAILABLE]	= "AVBL",
+	[FSCACHE_OBJECT_ACTIVE]		= "ACTV",
+	[FSCACHE_OBJECT_UPDATING]	= "UPDT",
+	[FSCACHE_OBJECT_DYING]		= "DYNG",
+	[FSCACHE_OBJECT_LC_DYING]	= "LCDY",
+	[FSCACHE_OBJECT_ABORT_INIT]	= "ABTI",
+	[FSCACHE_OBJECT_RELEASING]	= "RELS",
+	[FSCACHE_OBJECT_RECYCLING]	= "RCYC",
+	[FSCACHE_OBJECT_WITHDRAWING]	= "WTHD",
+	[FSCACHE_OBJECT_DEAD]		= "DEAD",
+};
+
 static void fscache_object_slow_work_put_ref(struct slow_work *);
 static int  fscache_object_slow_work_get_ref(struct slow_work *);
 static void fscache_object_slow_work_execute(struct slow_work *);
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
+#endif
 static void fscache_initialise_object(struct fscache_object *);
 static void fscache_lookup_object(struct fscache_object *);
 static void fscache_object_available(struct fscache_object *);
@@ -49,6 +69,9 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
 	.get_ref	= fscache_object_slow_work_get_ref,
 	.put_ref	= fscache_object_slow_work_put_ref,
 	.execute	= fscache_object_slow_work_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+	.desc		= fscache_object_slow_work_desc,
+#endif
 };
 EXPORT_SYMBOL(fscache_object_slow_work_ops);
 
@@ -326,6 +349,22 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 		fscache_enqueue_object(object);
 }
 
+/*
+ * describe an object for slow-work debugging
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_object_slow_work_desc(struct slow_work *work,
+					  struct seq_file *m)
+{
+	struct fscache_object *object =
+		container_of(work, struct fscache_object, work);
+
+	seq_printf(m, "FSC: OBJ%x: %s",
+		   object->debug_id,
+		   fscache_object_states_short[object->state]);
+}
+#endif
+
 /*
  * initialise an object
  * - check the specified object's parent to see if we can make use of it
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index f1a2857b2ff5..91bbe6f0377c 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -13,6 +13,7 @@
 
 #define FSCACHE_DEBUG_LEVEL OPERATION
 #include <linux/module.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
 atomic_t fscache_op_debug_id;
@@ -31,6 +32,8 @@ void fscache_enqueue_operation(struct fscache_operation *op)
 	_enter("{OBJ%x OP%x,%u}",
 	       op->object->debug_id, op->debug_id, atomic_read(&op->usage));
 
+	fscache_set_op_state(op, "EnQ");
+
 	ASSERT(op->processor != NULL);
 	ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
 	ASSERTCMP(atomic_read(&op->usage), >, 0);
@@ -67,6 +70,8 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
 static void fscache_run_op(struct fscache_object *object,
 			   struct fscache_operation *op)
 {
+	fscache_set_op_state(op, "Run");
+
 	object->n_in_progress++;
 	if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
 		wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -87,6 +92,8 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 
 	_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
 
+	fscache_set_op_state(op, "SubmitX");
+
 	spin_lock(&object->lock);
 	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
 	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -190,6 +197,8 @@ int fscache_submit_op(struct fscache_object *object,
 
 	ASSERTCMP(atomic_read(&op->usage), >, 0);
 
+	fscache_set_op_state(op, "Submit");
+
 	spin_lock(&object->lock);
 	ASSERTCMP(object->n_ops, >=, object->n_in_progress);
 	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -298,6 +307,8 @@ void fscache_put_operation(struct fscache_operation *op)
 	if (!atomic_dec_and_test(&op->usage))
 		return;
 
+	fscache_set_op_state(op, "Put");
+
 	_debug("PUT OP");
 	if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
 		BUG();
@@ -452,9 +463,27 @@ static void fscache_op_execute(struct slow_work *work)
 	_leave("");
 }
 
+/*
+ * describe an operation for slow-work debugging
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
+{
+	struct fscache_operation *op =
+		container_of(work, struct fscache_operation, slow_work);
+
+	seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
+		   op->object->debug_id, op->debug_id,
+		   op->name, op->state, op->flags);
+}
+#endif
+
 const struct slow_work_ops fscache_op_slow_work_ops = {
 	.owner		= THIS_MODULE,
 	.get_ref	= fscache_op_get_ref,
 	.put_ref	= fscache_op_put_ref,
 	.execute	= fscache_op_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+	.desc		= fscache_op_desc,
+#endif
 };
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 2568e0eb644f..e8bbc395cef6 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -63,14 +63,19 @@ static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *p
 static void fscache_attr_changed_op(struct fscache_operation *op)
 {
 	struct fscache_object *object = op->object;
+	int ret;
 
 	_enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
 
 	fscache_stat(&fscache_n_attr_changed_calls);
 
-	if (fscache_object_is_active(object) &&
-	    object->cache->ops->attr_changed(object) < 0)
-		fscache_abort_object(object);
+	if (fscache_object_is_active(object)) {
+		fscache_set_op_state(op, "CallFS");
+		ret = object->cache->ops->attr_changed(object);
+		fscache_set_op_state(op, "Done");
+		if (ret < 0)
+			fscache_abort_object(object);
+	}
 
 	_leave("");
 }
@@ -99,6 +104,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 	fscache_operation_init(op, NULL);
 	fscache_operation_init_slow(op, fscache_attr_changed_op);
 	op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+	fscache_set_op_name(op, "Attr");
 
 	spin_lock(&cookie->lock);
 
@@ -184,6 +190,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 	op->start_time	= jiffies;
 	INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
 	INIT_LIST_HEAD(&op->to_do);
+	fscache_set_op_name(&op->op, "Retr");
 	return op;
 }
 
@@ -257,6 +264,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
 	}
+	fscache_set_op_name(&op->op, "RetrRA1");
 
 	spin_lock(&cookie->lock);
 
@@ -369,6 +377,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	op = fscache_alloc_retrieval(mapping, end_io_func, context);
 	if (!op)
 		return -ENOMEM;
+	fscache_set_op_name(&op->op, "RetrRAN");
 
 	spin_lock(&cookie->lock);
 
@@ -461,6 +470,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 	op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
 	if (!op)
 		return -ENOMEM;
+	fscache_set_op_name(&op->op, "RetrAL1");
 
 	spin_lock(&cookie->lock);
 
@@ -529,6 +539,8 @@ static void fscache_write_op(struct fscache_operation *_op)
 
 	_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
 
+	fscache_set_op_state(&op->op, "GetPage");
+
 	spin_lock(&cookie->lock);
 	spin_lock(&object->lock);
 
@@ -559,13 +571,17 @@ static void fscache_write_op(struct fscache_operation *_op)
 	spin_unlock(&cookie->lock);
 
 	if (page) {
+		fscache_set_op_state(&op->op, "Store");
 		ret = object->cache->ops->write_page(op, page);
+		fscache_set_op_state(&op->op, "EndWrite");
 		fscache_end_page_write(cookie, page);
 		page_cache_release(page);
-		if (ret < 0)
+		if (ret < 0) {
+			fscache_set_op_state(&op->op, "Abort");
 			fscache_abort_object(object);
-		else
+		} else {
 			fscache_enqueue_operation(&op->op);
+		}
 	}
 
 	_leave("");
@@ -634,6 +650,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	fscache_operation_init(&op->op, fscache_release_write_op);
 	fscache_operation_init_slow(&op->op, fscache_write_op);
 	op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+	fscache_set_op_name(&op->op, "Write1");
 
 	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
 	if (ret < 0)
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 84d3532dd3ea..7a9847ccd192 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -102,6 +102,16 @@ struct fscache_operation {
 
 	/* operation releaser */
 	fscache_operation_release_t release;
+
+#ifdef CONFIG_SLOW_WORK_PROC
+	const char *name;		/* operation name */
+	const char *state;		/* operation state */
+#define fscache_set_op_name(OP, N)	do { (OP)->name  = (N); } while(0)
+#define fscache_set_op_state(OP, S)	do { (OP)->state = (S); } while(0)
+#else
+#define fscache_set_op_name(OP, N)	do { } while(0)
+#define fscache_set_op_state(OP, S)	do { } while(0)
+#endif
 };
 
 extern atomic_t fscache_op_debug_id;
@@ -125,6 +135,7 @@ static inline void fscache_operation_init(struct fscache_operation *op,
 	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
 	op->release = release;
 	INIT_LIST_HEAD(&op->pend_link);
+	fscache_set_op_state(op, "Init");
 }
 
 /**
@@ -337,6 +348,7 @@ struct fscache_object {
 		FSCACHE_OBJECT_RECYCLING,	/* retiring object */
 		FSCACHE_OBJECT_WITHDRAWING,	/* withdrawing object */
 		FSCACHE_OBJECT_DEAD,		/* object is now dead */
+		FSCACHE_OBJECT__NSTATES
 	} state;
 
 	int			debug_id;	/* debugging ID */
-- 
cgit v1.2.3


From 4fbf4291aa15926cd4fdca0ffe9122e89d0459db Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:11:04 +0000
Subject: FS-Cache: Allow the current state of all objects to be dumped

Allow the current state of all fscache objects to be dumped by doing:

	cat /proc/fs/fscache/objects

By default, all objects and all fields will be shown.  This can be restricted
by adding a suitable key to one of the caller's keyrings (such as the session
keyring):

	keyctl add user fscache:objlist "<restrictions>" @s

The <restrictions> are:

	K	Show hexdump of object key (don't show if not given)
	A	Show hexdump of object aux data (don't show if not given)

And paired restrictions:

	C	Show objects that have a cookie
	c	Show objects that don't have a cookie
	B	Show objects that are busy
	b	Show objects that aren't busy
	W	Show objects that have pending writes
	w	Show objects that don't have pending writes
	R	Show objects that have outstanding reads
	r	Show objects that don't have outstanding reads
	S	Show objects that have slow work queued
	s	Show objects that don't have slow work queued

If neither side of a restriction pair is given, then both are implied.  For
example:

	keyctl add user fscache:objlist KB @s

shows objects that are busy, and lists their object keys, but does not dump
their auxiliary data.  It also implies "CcWwRrSs", but as 'B' is given, 'b' is
not implied.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/filesystems/caching/fscache.txt |  81 +++++
 fs/cachefiles/interface.c                     |   1 +
 fs/cachefiles/rdwr.c                          |   6 +-
 fs/fscache/Kconfig                            |   7 +
 fs/fscache/Makefile                           |   1 +
 fs/fscache/cache.c                            |   1 +
 fs/fscache/cookie.c                           |   2 +
 fs/fscache/internal.h                         |  13 +
 fs/fscache/object-list.c                      | 432 ++++++++++++++++++++++++++
 fs/fscache/object.c                           |   2 +-
 fs/fscache/operation.c                        |   3 +
 fs/fscache/page.c                             |   6 +
 fs/fscache/proc.c                             |  13 +
 include/linux/fscache-cache.h                 |  13 +
 14 files changed, 578 insertions(+), 3 deletions(-)
 create mode 100644 fs/fscache/object-list.c

(limited to 'include/linux')

diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index 9e94b9491d89..cac09e11ca30 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -299,6 +299,87 @@ proc files.
      jiffy range covered, and the SECS field the equivalent number of seconds.
 
 
+===========
+OBJECT LIST
+===========
+
+If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a
+list of all the objects currently allocated and allow them to be viewed
+through:
+
+	/proc/fs/fscache/objects
+
+This will look something like:
+
+	[root@andromeda ~]# head /proc/fs/fscache/objects
+	OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA       OBJECT_KEY, AUX_DATA
+	======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================
+	   17e4b        2 ACTV     0   0   0   0  0     0 7b  4 0 8 | NFS.fh           DT  0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
+	   1693a        2 ACTV     0   0   0   0  0     0 7b  4 0 8 | NFS.fh           DT  0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
+
+where the first set of columns before the '|' describe the object:
+
+	COLUMN	DESCRIPTION
+	=======	===============================================================
+	OBJECT	Object debugging ID (appears as OBJ%x in some debug messages)
+	PARENT	Debugging ID of parent object
+	STAT	Object state
+	CHLDN	Number of child objects of this object
+	OPS	Number of outstanding operations on this object
+	OOP	Number of outstanding child object management operations
+	IPR
+	EX	Number of outstanding exclusive operations
+	READS	Number of outstanding read operations
+	EM	Object's event mask
+	EV	Events raised on this object
+	F	Object flags
+	S	Object slow-work work item flags
+
+and the second set of columns describe the object's cookie, if present:
+
+	COLUMN		DESCRIPTION
+	===============	=======================================================
+	NETFS_COOKIE_DEF Name of netfs cookie definition
+	TY		Cookie type (IX - index, DT - data, hex - special)
+	FL		Cookie flags
+	NETFS_DATA	Netfs private data stored in the cookie
+	OBJECT_KEY	Object key	} 1 column, with separating comma
+	AUX_DATA	Object aux data	} presence may be configured
+
+The data shown may be filtered by attaching the a key to an appropriate keyring
+before viewing the file.  Something like:
+
+		keyctl add user fscache:objlist <restrictions> @s
+
+where <restrictions> are a selection of the following letters:
+
+	K	Show hexdump of object key (don't show if not given)
+	A	Show hexdump of object aux data (don't show if not given)
+
+and the following paired letters:
+
+	C	Show objects that have a cookie
+	c	Show objects that don't have a cookie
+	B	Show objects that are busy
+	b	Show objects that aren't busy
+	W	Show objects that have pending writes
+	w	Show objects that don't have pending writes
+	R	Show objects that have outstanding reads
+	r	Show objects that don't have outstanding reads
+	S	Show objects that have slow work queued
+	s	Show objects that don't have slow work queued
+
+If neither side of a letter pair is given, then both are implied.  For example:
+
+	keyctl add user fscache:objlist KB @s
+
+shows objects that are busy, and lists their object keys, but does not dump
+their auxiliary data.  It also implies "CcWwRrSs", but as 'B' is given, 'b' is
+not implied.
+
+By default all objects and all fields will be shown.
+
+
 =========
 DEBUGGING
 =========
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 431accd475a7..dd7f852746cb 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -331,6 +331,7 @@ static void cachefiles_put_object(struct fscache_object *_object)
 		}
 
 		cache = object->fscache.cache;
+		fscache_object_destroy(&object->fscache);
 		kmem_cache_free(cachefiles_object_jar, object);
 		fscache_object_destroyed(cache);
 	}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index a69787e7dd96..3304646dae84 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -333,7 +333,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 
 	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
 
-	op->op.flags = FSCACHE_OP_FAST;
+	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+	op->op.flags |= FSCACHE_OP_FAST;
 	op->op.processor = cachefiles_read_copier;
 
 	pagevec_init(&pagevec, 0);
@@ -639,7 +640,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
 
 	pagevec_init(&pagevec, 0);
 
-	op->op.flags = FSCACHE_OP_FAST;
+	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
+	op->op.flags |= FSCACHE_OP_FAST;
 	op->op.processor = cachefiles_read_copier;
 
 	INIT_LIST_HEAD(&backpages);
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 9bbb8ce7bea0..864dac20a242 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -54,3 +54,10 @@ config FSCACHE_DEBUG
 	  enabled by setting bits in /sys/modules/fscache/parameter/debug.
 
 	  See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_OBJECT_LIST
+	bool "Maintain global object list for debugging purposes"
+	depends on FSCACHE && PROC_FS
+	help
+	  Maintain a global list of active fscache objects that can be
+	  retrieved through /proc/fs/fscache/objects for debugging purposes
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 91571b95aacc..6d561531cb36 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -15,5 +15,6 @@ fscache-y := \
 fscache-$(CONFIG_PROC_FS) += proc.o
 fscache-$(CONFIG_FSCACHE_STATS) += stats.o
 fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o
 
 obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index e21985bbb1fb..724384ef96de 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -263,6 +263,7 @@ int fscache_add_cache(struct fscache_cache *cache,
 	spin_lock(&cache->object_list_lock);
 	list_add_tail(&ifsdef->cache_link, &cache->object_list);
 	spin_unlock(&cache->object_list_lock);
+	fscache_objlist_add(ifsdef);
 
 	/* add the cache's netfs definition index object to the top level index
 	 * cookie as a known backing object */
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 72fd18f6c71f..9b5187328230 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -349,6 +349,8 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
 	object->cookie = cookie;
 	atomic_inc(&cookie->usage);
 	hlist_add_head(&object->cookie_link, &cookie->backing_objects);
+
+	fscache_objlist_add(object);
 	ret = 0;
 
 cant_attach_object:
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 1c341304621f..fe02973a9516 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -88,10 +88,23 @@ extern int fscache_wait_bit_interruptible(void *);
 /*
  * object.c
  */
+extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
+
 extern void fscache_withdrawing_object(struct fscache_cache *,
 				       struct fscache_object *);
 extern void fscache_enqueue_object(struct fscache_object *);
 
+/*
+ * object-list.c
+ */
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+extern const struct file_operations fscache_objlist_fops;
+
+extern void fscache_objlist_add(struct fscache_object *);
+#else
+#define fscache_objlist_add(object) do {} while(0)
+#endif
+
 /*
  * operation.c
  */
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
new file mode 100644
index 000000000000..e590242fa41a
--- /dev/null
+++ b/fs/fscache/object-list.c
@@ -0,0 +1,432 @@
+/* Global fscache object list maintainer and viewer
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/key.h>
+#include <keys/user-type.h>
+#include "internal.h"
+
+static struct rb_root fscache_object_list;
+static DEFINE_RWLOCK(fscache_object_list_lock);
+
+struct fscache_objlist_data {
+	unsigned long	config;		/* display configuration */
+#define FSCACHE_OBJLIST_CONFIG_KEY	0x00000001	/* show object keys */
+#define FSCACHE_OBJLIST_CONFIG_AUX	0x00000002	/* show object auxdata */
+#define FSCACHE_OBJLIST_CONFIG_COOKIE	0x00000004	/* show objects with cookies */
+#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE	0x00000008	/* show objects without cookies */
+#define FSCACHE_OBJLIST_CONFIG_BUSY	0x00000010	/* show busy objects */
+#define FSCACHE_OBJLIST_CONFIG_IDLE	0x00000020	/* show idle objects */
+#define FSCACHE_OBJLIST_CONFIG_PENDWR	0x00000040	/* show objects with pending writes */
+#define FSCACHE_OBJLIST_CONFIG_NOPENDWR	0x00000080	/* show objects without pending writes */
+#define FSCACHE_OBJLIST_CONFIG_READS	0x00000100	/* show objects with active reads */
+#define FSCACHE_OBJLIST_CONFIG_NOREADS	0x00000200	/* show objects without active reads */
+#define FSCACHE_OBJLIST_CONFIG_EVENTS	0x00000400	/* show objects with events */
+#define FSCACHE_OBJLIST_CONFIG_NOEVENTS	0x00000800	/* show objects without no events */
+#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with slow work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without slow work */
+
+	u8		buf[512];	/* key and aux data buffer */
+};
+
+/*
+ * Add an object to the object list
+ * - we use the address of the fscache_object structure as the key into the
+ *   tree
+ */
+void fscache_objlist_add(struct fscache_object *obj)
+{
+	struct fscache_object *xobj;
+	struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL;
+
+	write_lock(&fscache_object_list_lock);
+
+	while (*p) {
+		parent = *p;
+		xobj = rb_entry(parent, struct fscache_object, objlist_link);
+
+		if (obj < xobj)
+			p = &(*p)->rb_left;
+		else if (obj > xobj)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&obj->objlist_link, parent, p);
+	rb_insert_color(&obj->objlist_link, &fscache_object_list);
+
+	write_unlock(&fscache_object_list_lock);
+}
+
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *obj)
+{
+	write_lock(&fscache_object_list_lock);
+
+	BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
+	rb_erase(&obj->objlist_link, &fscache_object_list);
+
+	write_unlock(&fscache_object_list_lock);
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
+/*
+ * find the object in the tree on or after the specified index
+ */
+static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
+{
+	struct fscache_object *pobj, *obj, *minobj = NULL;
+	struct rb_node *p;
+	unsigned long pos;
+
+	if (*_pos >= (unsigned long) ERR_PTR(-ENOENT))
+		return NULL;
+	pos = *_pos;
+
+	/* banners (can't represent line 0 by pos 0 as that would involve
+	 * returning a NULL pointer) */
+	if (pos == 0)
+		return (struct fscache_object *) ++(*_pos);
+	if (pos < 3)
+		return (struct fscache_object *)pos;
+
+	pobj = (struct fscache_object *)pos;
+	p = fscache_object_list.rb_node;
+	while (p) {
+		obj = rb_entry(p, struct fscache_object, objlist_link);
+		if (pobj < obj) {
+			if (!minobj || minobj > obj)
+				minobj = obj;
+			p = p->rb_left;
+		} else if (pobj > obj) {
+			p = p->rb_right;
+		} else {
+			minobj = obj;
+			break;
+		}
+		obj = NULL;
+	}
+
+	if (!minobj)
+		*_pos = (unsigned long) ERR_PTR(-ENOENT);
+	else if (minobj != obj)
+		*_pos = (unsigned long) minobj;
+	return minobj;
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos)
+	__acquires(&fscache_object_list_lock)
+{
+	read_lock(&fscache_object_list_lock);
+	return fscache_objlist_lookup(_pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	(*_pos)++;
+	return fscache_objlist_lookup(_pos);
+}
+
+/*
+ * clean up after reading
+ */
+static void fscache_objlist_stop(struct seq_file *m, void *v)
+	__releases(&fscache_object_list_lock)
+{
+	read_unlock(&fscache_object_list_lock);
+}
+
+/*
+ * display an object
+ */
+static int fscache_objlist_show(struct seq_file *m, void *v)
+{
+	struct fscache_objlist_data *data = m->private;
+	struct fscache_object *obj = v;
+	unsigned long config = data->config;
+	uint16_t keylen, auxlen;
+	char _type[3], *type;
+	bool no_cookie;
+	u8 *buf = data->buf, *p;
+
+	if ((unsigned long) v == 1) {
+		seq_puts(m, "OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS"
+			 " EM EV F S"
+			 " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
+		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+			      FSCACHE_OBJLIST_CONFIG_AUX))
+			seq_puts(m, "       ");
+		if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+			seq_puts(m, "OBJECT_KEY");
+		if ((config & (FSCACHE_OBJLIST_CONFIG_KEY |
+			       FSCACHE_OBJLIST_CONFIG_AUX)) ==
+		    (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX))
+			seq_puts(m, ", ");
+		if (config & FSCACHE_OBJLIST_CONFIG_AUX)
+			seq_puts(m, "AUX_DATA");
+		seq_puts(m, "\n");
+		return 0;
+	}
+
+	if ((unsigned long) v == 2) {
+		seq_puts(m, "======== ======== ==== ===== === === === == ====="
+			 " == == = ="
+			 " | ================ == == ================");
+		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
+			      FSCACHE_OBJLIST_CONFIG_AUX))
+			seq_puts(m, " ================");
+		seq_puts(m, "\n");
+		return 0;
+	}
+
+	/* filter out any unwanted objects */
+#define FILTER(criterion, _yes, _no)					\
+	do {								\
+		unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes;	\
+		unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no;	\
+		if (criterion) {					\
+			if (!(config & yes))				\
+				return 0;				\
+		} else {						\
+			if (!(config & no))				\
+				return 0;				\
+		}							\
+	} while(0)
+
+	if (~config) {
+		FILTER(obj->cookie,
+		       COOKIE, NOCOOKIE);
+		FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+		       obj->n_ops != 0 ||
+		       obj->n_obj_ops != 0 ||
+		       obj->flags ||
+		       !list_empty(&obj->dependents),
+		       BUSY, IDLE);
+		FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags),
+		       PENDWR, NOPENDWR);
+		FILTER(atomic_read(&obj->n_reads),
+		       READS, NOREADS);
+		FILTER(obj->events & obj->event_mask,
+		       EVENTS, NOEVENTS);
+		FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
+		       WORK, NOWORK);
+	}
+
+	seq_printf(m,
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
+		   obj->debug_id,
+		   obj->parent ? obj->parent->debug_id : -1,
+		   fscache_object_states_short[obj->state],
+		   obj->n_children,
+		   obj->n_ops,
+		   obj->n_obj_ops,
+		   obj->n_in_progress,
+		   obj->n_exclusive,
+		   atomic_read(&obj->n_reads),
+		   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
+		   obj->events,
+		   obj->flags,
+		   obj->work.flags);
+
+	no_cookie = true;
+	keylen = auxlen = 0;
+	if (obj->cookie) {
+		spin_lock(&obj->lock);
+		if (obj->cookie) {
+			switch (obj->cookie->def->type) {
+			case 0:
+				type = "IX";
+				break;
+			case 1:
+				type = "DT";
+				break;
+			default:
+				sprintf(_type, "%02u",
+					obj->cookie->def->type);
+				type = _type;
+				break;
+			}
+
+			seq_printf(m, "%-16s %s %2lx %16p",
+				   obj->cookie->def->name,
+				   type,
+				   obj->cookie->flags,
+				   obj->cookie->netfs_data);
+
+			if (obj->cookie->def->get_key &&
+			    config & FSCACHE_OBJLIST_CONFIG_KEY)
+				keylen = obj->cookie->def->get_key(
+					obj->cookie->netfs_data,
+					buf, 400);
+
+			if (obj->cookie->def->get_aux &&
+			    config & FSCACHE_OBJLIST_CONFIG_AUX)
+				auxlen = obj->cookie->def->get_aux(
+					obj->cookie->netfs_data,
+					buf + keylen, 512 - keylen);
+
+			no_cookie = false;
+		}
+		spin_unlock(&obj->lock);
+
+		if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+			seq_printf(m, " ");
+			for (p = buf; keylen > 0; keylen--)
+				seq_printf(m, "%02x", *p++);
+			if (auxlen > 0) {
+				if (config & FSCACHE_OBJLIST_CONFIG_KEY)
+					seq_printf(m, ", ");
+				for (; auxlen > 0; auxlen--)
+					seq_printf(m, "%02x", *p++);
+			}
+		}
+	}
+
+	if (no_cookie)
+		seq_printf(m, "<no_cookie>\n");
+	else
+		seq_printf(m, "\n");
+	return 0;
+}
+
+static const struct seq_operations fscache_objlist_ops = {
+	.start		= fscache_objlist_start,
+	.stop		= fscache_objlist_stop,
+	.next		= fscache_objlist_next,
+	.show		= fscache_objlist_show,
+};
+
+/*
+ * get the configuration for filtering the list
+ */
+static void fscache_objlist_config(struct fscache_objlist_data *data)
+{
+#ifdef CONFIG_KEYS
+	struct user_key_payload *confkey;
+	unsigned long config;
+	struct key *key;
+	const char *buf;
+	int len;
+
+	key = request_key(&key_type_user, "fscache:objlist", NULL);
+	if (IS_ERR(key))
+		goto no_config;
+
+	config = 0;
+	rcu_read_lock();
+
+	confkey = key->payload.data;
+	buf = confkey->data;
+
+	for (len = confkey->datalen - 1; len >= 0; len--) {
+		switch (buf[len]) {
+		case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY;		break;
+		case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX;		break;
+		case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE;	break;
+		case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE;	break;
+		case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY;	break;
+		case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE;	break;
+		case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR;	break;
+		case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR;	break;
+		case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS;	break;
+		case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS;	break;
+		case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK;	break;
+		case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK;	break;
+		}
+	}
+
+	rcu_read_unlock();
+	key_put(key);
+
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS;
+	if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK)))
+	    config   |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK;
+
+	data->config = config;
+	return;
+
+no_config:
+#endif
+	data->config = ULONG_MAX;
+}
+
+/*
+ * open "/proc/fs/fscache/objects" to provide a list of active objects
+ * - can be configured by a user-defined key added to the caller's keyrings
+ */
+static int fscache_objlist_open(struct inode *inode, struct file *file)
+{
+	struct fscache_objlist_data *data;
+	struct seq_file *m;
+	int ret;
+
+	ret = seq_open(file, &fscache_objlist_ops);
+	if (ret < 0)
+		return ret;
+
+	m = file->private_data;
+
+	/* buffer for key extraction */
+	data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL);
+	if (!data) {
+		seq_release(inode, file);
+		return -ENOMEM;
+	}
+
+	/* get the configuration key */
+	fscache_objlist_config(data);
+
+	m->private = data;
+	return 0;
+}
+
+/*
+ * clean up on close
+ */
+static int fscache_objlist_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	kfree(m->private);
+	m->private = NULL;
+	return seq_release(inode, file);
+}
+
+const struct file_operations fscache_objlist_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fscache_objlist_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= fscache_objlist_release,
+};
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 615b63dd9ecc..ad1644f073bd 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -34,7 +34,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
 };
 EXPORT_SYMBOL(fscache_object_states);
 
-static const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
+const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
 	[FSCACHE_OBJECT_INIT]		= "INIT",
 	[FSCACHE_OBJECT_LOOKING_UP]	= "LOOK",
 	[FSCACHE_OBJECT_CREATING]	= "CRTN",
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 91bbe6f0377c..09e43b6e822f 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -322,6 +322,9 @@ void fscache_put_operation(struct fscache_operation *op)
 
 	object = op->object;
 
+	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+		atomic_dec(&object->n_reads);
+
 	/* now... we may get called with the object spinlock held, so we
 	 * complete the cleanup here only if we can immediately acquire the
 	 * lock, and defer it otherwise */
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index e8bbc395cef6..c5973e38ce39 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -275,6 +275,9 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
 
+	atomic_inc(&object->n_reads);
+	set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
 	if (fscache_submit_op(object, &op->op) < 0)
 		goto nobufs_unlock;
 	spin_unlock(&cookie->lock);
@@ -386,6 +389,9 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
+	atomic_inc(&object->n_reads);
+	set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
 	if (fscache_submit_op(object, &op->op) < 0)
 		goto nobufs_unlock;
 	spin_unlock(&cookie->lock);
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index beeab44bc31a..1d9e4951a597 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -37,10 +37,20 @@ int __init fscache_proc_init(void)
 		goto error_histogram;
 #endif
 
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+	if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL,
+			 &fscache_objlist_fops))
+		goto error_objects;
+#endif
+
 	_leave(" = 0");
 	return 0;
 
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+error_objects:
+#endif
 #ifdef CONFIG_FSCACHE_HISTOGRAM
+	remove_proc_entry("fs/fscache/histogram", NULL);
 error_histogram:
 #endif
 #ifdef CONFIG_FSCACHE_STATS
@@ -58,6 +68,9 @@ error_dir:
  */
 void fscache_proc_cleanup(void)
 {
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+	remove_proc_entry("fs/fscache/objects", NULL);
+#endif
 #ifdef CONFIG_FSCACHE_HISTOGRAM
 	remove_proc_entry("fs/fscache/histogram", NULL);
 #endif
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 7a9847ccd192..184cbdfbcc99 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -91,6 +91,8 @@ struct fscache_operation {
 #define FSCACHE_OP_WAITING	4	/* cleared when op is woken */
 #define FSCACHE_OP_EXCLUSIVE	5	/* exclusive op, other ops must wait */
 #define FSCACHE_OP_DEAD		6	/* op is now dead */
+#define FSCACHE_OP_DEC_READ_CNT	7	/* decrement object->n_reads on destruction */
+#define FSCACHE_OP_KEEP_FLAGS	0xc0	/* flags to keep when repurposing an op */
 
 	atomic_t		usage;
 	unsigned		debug_id;	/* debugging ID */
@@ -357,6 +359,7 @@ struct fscache_object {
 	int			n_obj_ops;	/* number of object ops outstanding on object */
 	int			n_in_progress;	/* number of ops in progress */
 	int			n_exclusive;	/* number of exclusive ops queued */
+	atomic_t		n_reads;	/* number of read ops in progress */
 	spinlock_t		lock;		/* state and operations lock */
 
 	unsigned long		lookup_jif;	/* time at which lookup started */
@@ -370,6 +373,7 @@ struct fscache_object {
 #define FSCACHE_OBJECT_EV_RELEASE	4	/* T if netfs requested object release */
 #define FSCACHE_OBJECT_EV_RETIRE	5	/* T if netfs requested object retirement */
 #define FSCACHE_OBJECT_EV_WITHDRAW	6	/* T if cache requested object withdrawal */
+#define FSCACHE_OBJECT_EVENTS_MASK	0x7f	/* mask of all events*/
 
 	unsigned long		flags;
 #define FSCACHE_OBJECT_LOCK		0	/* T if object is busy being processed */
@@ -385,6 +389,9 @@ struct fscache_object {
 	struct list_head	dependents;	/* FIFO of dependent objects */
 	struct list_head	dep_link;	/* link in parent's dependents list */
 	struct list_head	pending_ops;	/* unstarted operations on this object */
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+	struct rb_node		objlist_link;	/* link in global object list */
+#endif
 	pgoff_t			store_limit;	/* current storage limit */
 };
 
@@ -434,6 +441,12 @@ void fscache_object_init(struct fscache_object *object,
 extern void fscache_object_lookup_negative(struct fscache_object *object);
 extern void fscache_obtained_object(struct fscache_object *object);
 
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+extern void fscache_object_destroy(struct fscache_object *object);
+#else
+#define fscache_object_destroy(object) do {} while(0)
+#endif
+
 /**
  * fscache_object_destroyed - Note destruction of an object in a cache
  * @cache: The cache from which the object came
-- 
cgit v1.2.3


From 1bccf513ac49d44604ba1cddcc29f5886e70f1b6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:11:25 +0000
Subject: FS-Cache: Fix lock misorder in fscache_write_op()

FS-Cache has two structs internally for keeping track of the internal state of
a cached file: the fscache_cookie struct, which represents the netfs's state,
and fscache_object struct, which represents the cache's state.  Each has a
pointer that points to the other (when both are in existence), and each has a
spinlock for pointer maintenance.

Since netfs operations approach these structures from the cookie side, they get
the cookie lock first, then the object lock.  Cache operations, on the other
hand, approach from the object side, and get the object lock first.  It is not
then permitted for a cache operation to get the cookie lock whilst it is
holding the object lock lest deadlock occur; instead, it must do one of two
things:

 (1) increment the cookie usage counter, drop the object lock and then get both
     locks in order, or

 (2) simply hold the object lock as certain parts of the cookie may not be
     altered whilst the object lock is held.

It is also not permitted to follow either pointer without holding the lock at
the end you start with.  To break the pointers between the cookie and the
object, both locks must be held.

fscache_write_op(), however, violates the locking rules: It attempts to get the
cookie lock without (a) checking that the cookie pointer is a valid pointer,
and (b) holding the object lock to protect the cookie pointer whilst it follows
it.  This is so that it can access the pending page store tree without
interference from __fscache_write_page().

This is fixed by splitting the cookie lock, such that the page store tracking
tree is protected by its own lock, and checking that the cookie pointer is
non-NULL before we attempt to follow it whilst holding the object lock.

The new lock is subordinate to both the cookie lock and the object lock, and so
should be taken after those.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/filesystems/caching/fscache.txt |  3 ++
 fs/fscache/cookie.c                           |  1 +
 fs/fscache/internal.h                         |  4 +++
 fs/fscache/page.c                             | 52 ++++++++++++++++++---------
 fs/fscache/stats.c                            | 10 ++++--
 include/linux/fscache-cache.h                 |  1 +
 6 files changed, 52 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index 0a77868f4977..9cf2cfbc81c9 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -269,6 +269,9 @@ proc files.
 		oom=N	Number of store reqs failed -ENOMEM
 		ops=N	Number of store reqs submitted
 		run=N	Number of store reqs granted CPU time
+		pgs=N	Number of pages given store req processing time
+		rxd=N	Number of store reqs deleted from tracking tree
+		olm=N	Number of store reqs over store limit
 	Ops	pend=N	Number of times async ops added to pending queues
 		run=N	Number of times async ops given CPU time
 		enq=N	Number of times async ops queued for processing
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index e6854f5222f5..f979659c1b3f 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -36,6 +36,7 @@ void fscache_cookie_init_once(void *_cookie)
 
 	memset(cookie, 0, sizeof(*cookie));
 	spin_lock_init(&cookie->lock);
+	spin_lock_init(&cookie->stores_lock);
 	INIT_HLIST_HEAD(&cookie->backing_objects);
 }
 
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 50324ad2b194..ba1853fa1ff9 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -17,6 +17,7 @@
  * - cache->object_list_lock
  * - object->lock
  * - object->parent->lock
+ * - cookie->stores_lock
  * - fscache_thread_lock
  *
  */
@@ -174,6 +175,9 @@ extern atomic_t fscache_n_stores_nobufs;
 extern atomic_t fscache_n_stores_oom;
 extern atomic_t fscache_n_store_ops;
 extern atomic_t fscache_n_store_calls;
+extern atomic_t fscache_n_store_pages;
+extern atomic_t fscache_n_store_radix_deletes;
+extern atomic_t fscache_n_store_pages_over_limit;
 
 extern atomic_t fscache_n_marks;
 extern atomic_t fscache_n_uncaches;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index e6f2e61133a1..3ea8897bc217 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -45,16 +45,26 @@ EXPORT_SYMBOL(__fscache_wait_on_page_write);
 /*
  * note that a page has finished being written to the cache
  */
-static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
+static void fscache_end_page_write(struct fscache_object *object,
+				   struct page *page)
 {
-	struct page *xpage;
+	struct fscache_cookie *cookie;
+	struct page *xpage = NULL;
 
-	spin_lock(&cookie->lock);
-	xpage = radix_tree_delete(&cookie->stores, page->index);
-	spin_unlock(&cookie->lock);
-	ASSERT(xpage != NULL);
-
-	wake_up_bit(&cookie->flags, 0);
+	spin_lock(&object->lock);
+	cookie = object->cookie;
+	if (cookie) {
+		/* delete the page from the tree if it is now no longer
+		 * pending */
+		spin_lock(&cookie->stores_lock);
+		fscache_stat(&fscache_n_store_radix_deletes);
+		xpage = radix_tree_delete(&cookie->stores, page->index);
+		spin_unlock(&cookie->stores_lock);
+		wake_up_bit(&cookie->flags, 0);
+	}
+	spin_unlock(&object->lock);
+	if (xpage)
+		page_cache_release(xpage);
 }
 
 /*
@@ -591,7 +601,7 @@ static void fscache_write_op(struct fscache_operation *_op)
 	struct fscache_storage *op =
 		container_of(_op, struct fscache_storage, op);
 	struct fscache_object *object = op->op.object;
-	struct fscache_cookie *cookie = object->cookie;
+	struct fscache_cookie *cookie;
 	struct page *page;
 	unsigned n;
 	void *results[1];
@@ -601,16 +611,17 @@ static void fscache_write_op(struct fscache_operation *_op)
 
 	fscache_set_op_state(&op->op, "GetPage");
 
-	spin_lock(&cookie->lock);
 	spin_lock(&object->lock);
+	cookie = object->cookie;
 
-	if (!fscache_object_is_active(object)) {
+	if (!fscache_object_is_active(object) || !cookie) {
 		spin_unlock(&object->lock);
-		spin_unlock(&cookie->lock);
 		_leave("");
 		return;
 	}
 
+	spin_lock(&cookie->stores_lock);
+
 	fscache_stat(&fscache_n_store_calls);
 
 	/* find a page to store */
@@ -621,23 +632,25 @@ static void fscache_write_op(struct fscache_operation *_op)
 		goto superseded;
 	page = results[0];
 	_debug("gang %d [%lx]", n, page->index);
-	if (page->index > op->store_limit)
+	if (page->index > op->store_limit) {
+		fscache_stat(&fscache_n_store_pages_over_limit);
 		goto superseded;
+	}
 
 	radix_tree_tag_clear(&cookie->stores, page->index,
 			     FSCACHE_COOKIE_PENDING_TAG);
 
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
-	spin_unlock(&cookie->lock);
 
 	if (page) {
 		fscache_set_op_state(&op->op, "Store");
+		fscache_stat(&fscache_n_store_pages);
 		fscache_stat(&fscache_n_cop_write_page);
 		ret = object->cache->ops->write_page(op, page);
 		fscache_stat_d(&fscache_n_cop_write_page);
 		fscache_set_op_state(&op->op, "EndWrite");
-		fscache_end_page_write(cookie, page);
-		page_cache_release(page);
+		fscache_end_page_write(object, page);
 		if (ret < 0) {
 			fscache_set_op_state(&op->op, "Abort");
 			fscache_abort_object(object);
@@ -653,9 +666,9 @@ superseded:
 	/* this writer is going away and there aren't any more things to
 	 * write */
 	_debug("cease");
+	spin_unlock(&cookie->stores_lock);
 	clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
 	spin_unlock(&object->lock);
-	spin_unlock(&cookie->lock);
 	_leave("");
 }
 
@@ -731,6 +744,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	/* add the page to the pending-storage radix tree on the backing
 	 * object */
 	spin_lock(&object->lock);
+	spin_lock(&cookie->stores_lock);
 
 	_debug("store limit %llx", (unsigned long long) object->store_limit);
 
@@ -751,6 +765,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
 		goto already_pending;
 
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 
 	op->op.debug_id	= atomic_inc_return(&fscache_op_debug_id);
@@ -772,6 +787,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 already_queued:
 	fscache_stat(&fscache_n_stores_again);
 already_pending:
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 	spin_unlock(&cookie->lock);
 	radix_tree_preload_end();
@@ -781,7 +797,9 @@ already_pending:
 	return 0;
 
 submit_failed:
+	spin_lock(&cookie->stores_lock);
 	radix_tree_delete(&cookie->stores, page->index);
+	spin_unlock(&cookie->stores_lock);
 	page_cache_release(page);
 	ret = -ENOBUFS;
 	goto nobufs;
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 4c07439d1307..1d53ea68409e 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -58,6 +58,9 @@ atomic_t fscache_n_stores_nobufs;
 atomic_t fscache_n_stores_oom;
 atomic_t fscache_n_store_ops;
 atomic_t fscache_n_store_calls;
+atomic_t fscache_n_store_pages;
+atomic_t fscache_n_store_radix_deletes;
+atomic_t fscache_n_store_pages_over_limit;
 
 atomic_t fscache_n_marks;
 atomic_t fscache_n_uncaches;
@@ -200,9 +203,12 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_stores_again),
 		   atomic_read(&fscache_n_stores_nobufs),
 		   atomic_read(&fscache_n_stores_oom));
-	seq_printf(m, "Stores : ops=%u run=%u\n",
+	seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n",
 		   atomic_read(&fscache_n_store_ops),
-		   atomic_read(&fscache_n_store_calls));
+		   atomic_read(&fscache_n_store_calls),
+		   atomic_read(&fscache_n_store_pages),
+		   atomic_read(&fscache_n_store_radix_deletes),
+		   atomic_read(&fscache_n_store_pages_over_limit));
 
 	seq_printf(m, "Ops    : pend=%u run=%u enq=%u can=%u\n",
 		   atomic_read(&fscache_n_op_pend),
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 184cbdfbcc99..f3aa4bdafef6 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -310,6 +310,7 @@ struct fscache_cookie {
 	atomic_t			usage;		/* number of users of this cookie */
 	atomic_t			n_children;	/* number of children of this cookie */
 	spinlock_t			lock;
+	spinlock_t			stores_lock;	/* lock on page store tree */
 	struct hlist_head		backing_objects; /* object(s) backing this file/index */
 	const struct fscache_cookie_def	*def;		/* definition */
 	struct fscache_cookie		*parent;	/* parent of this entry */
-- 
cgit v1.2.3


From 201a15428bd54f83eccec8b7c64a04b8f9431204 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:11:35 +0000
Subject: FS-Cache: Handle pages pending storage that get evicted under OOM
 conditions

Handle netfs pages that the vmscan algorithm wants to evict from the pagecache
under OOM conditions, but that are waiting for write to the cache.  Under these
conditions, vmscan calls the releasepage() function of the netfs, asking if a
page can be discarded.

The problem is typified by the following trace of a stuck process:

	kslowd005     D 0000000000000000     0  4253      2 0x00000080
	 ffff88001b14f370 0000000000000046 ffff880020d0d000 0000000000000007
	 0000000000000006 0000000000000001 ffff88001b14ffd8 ffff880020d0d2a8
	 000000000000ddf0 00000000000118c0 00000000000118c0 ffff880020d0d2a8
	Call Trace:
	 [<ffffffffa00782d8>] __fscache_wait_on_page_write+0x8b/0xa7 [fscache]
	 [<ffffffff8104c0f1>] ? autoremove_wake_function+0x0/0x34
	 [<ffffffffa0078240>] ? __fscache_check_page_write+0x63/0x70 [fscache]
	 [<ffffffffa00b671d>] nfs_fscache_release_page+0x4e/0xc4 [nfs]
	 [<ffffffffa00927f0>] nfs_release_page+0x3c/0x41 [nfs]
	 [<ffffffff810885d3>] try_to_release_page+0x32/0x3b
	 [<ffffffff81093203>] shrink_page_list+0x316/0x4ac
	 [<ffffffff8109372b>] shrink_inactive_list+0x392/0x67c
	 [<ffffffff813532fa>] ? __mutex_unlock_slowpath+0x100/0x10b
	 [<ffffffff81058df0>] ? trace_hardirqs_on_caller+0x10c/0x130
	 [<ffffffff8135330e>] ? mutex_unlock+0x9/0xb
	 [<ffffffff81093aa2>] shrink_list+0x8d/0x8f
	 [<ffffffff81093d1c>] shrink_zone+0x278/0x33c
	 [<ffffffff81052d6c>] ? ktime_get_ts+0xad/0xba
	 [<ffffffff81094b13>] try_to_free_pages+0x22e/0x392
	 [<ffffffff81091e24>] ? isolate_pages_global+0x0/0x212
	 [<ffffffff8108e743>] __alloc_pages_nodemask+0x3dc/0x5cf
	 [<ffffffff81089529>] grab_cache_page_write_begin+0x65/0xaa
	 [<ffffffff8110f8c0>] ext3_write_begin+0x78/0x1eb
	 [<ffffffff81089ec5>] generic_file_buffered_write+0x109/0x28c
	 [<ffffffff8103cb69>] ? current_fs_time+0x22/0x29
	 [<ffffffff8108a509>] __generic_file_aio_write+0x350/0x385
	 [<ffffffff8108a588>] ? generic_file_aio_write+0x4a/0xae
	 [<ffffffff8108a59e>] generic_file_aio_write+0x60/0xae
	 [<ffffffff810b2e82>] do_sync_write+0xe3/0x120
	 [<ffffffff8104c0f1>] ? autoremove_wake_function+0x0/0x34
	 [<ffffffff810b18e1>] ? __dentry_open+0x1a5/0x2b8
	 [<ffffffff810b1a76>] ? dentry_open+0x82/0x89
	 [<ffffffffa00e693c>] cachefiles_write_page+0x298/0x335 [cachefiles]
	 [<ffffffffa0077147>] fscache_write_op+0x178/0x2c2 [fscache]
	 [<ffffffffa0075656>] fscache_op_execute+0x7a/0xd1 [fscache]
	 [<ffffffff81082093>] slow_work_execute+0x18f/0x2d1
	 [<ffffffff8108239a>] slow_work_thread+0x1c5/0x308
	 [<ffffffff8104c0f1>] ? autoremove_wake_function+0x0/0x34
	 [<ffffffff810821d5>] ? slow_work_thread+0x0/0x308
	 [<ffffffff8104be91>] kthread+0x7a/0x82
	 [<ffffffff8100beda>] child_rip+0xa/0x20
	 [<ffffffff8100b87c>] ? restore_args+0x0/0x30
	 [<ffffffff8102ef83>] ? tg_shares_up+0x171/0x227
	 [<ffffffff8104be17>] ? kthread+0x0/0x82
	 [<ffffffff8100bed0>] ? child_rip+0x0/0x20

In the above backtrace, the following is happening:

 (1) A page storage operation is being executed by a slow-work thread
     (fscache_write_op()).

 (2) FS-Cache farms the operation out to the cache to perform
     (cachefiles_write_page()).

 (3) CacheFiles is then calling Ext3 to perform the actual write, using Ext3's
     standard write (do_sync_write()) under KERNEL_DS directly from the netfs
     page.

 (4) However, for Ext3 to perform the write, it must allocate some memory, in
     particular, it must allocate at least one page cache page into which it
     can copy the data from the netfs page.

 (5) Under OOM conditions, the memory allocator can't immediately come up with
     a page, so it uses vmscan to find something to discard
     (try_to_free_pages()).

 (6) vmscan finds a clean netfs page it might be able to discard (possibly the
     one it's trying to write out).

 (7) The netfs is called to throw the page away (nfs_release_page()) - but it's
     called with __GFP_WAIT, so the netfs decides to wait for the store to
     complete (__fscache_wait_on_page_write()).

 (8) This blocks a slow-work processing thread - possibly against itself.

The system ends up stuck because it can't write out any netfs pages to the
cache without allocating more memory.

To avoid this, we make FS-Cache cancel some writes that aren't in the middle of
actually being performed.  This means that some data won't make it into the
cache this time.  To support this, a new FS-Cache function is added
fscache_maybe_release_page() that replaces what the netfs releasepage()
functions used to do with respect to the cache.

The decisions fscache_maybe_release_page() makes are counted and displayed
through /proc/fs/fscache/stats on a line labelled "VmScan".  There are four
counters provided: "nos=N" - pages that weren't pending storage; "gon=N" -
pages that were pending storage when we first looked, but weren't by the time
we got the object lock; "bsy=N" - pages that we ignored as they were actively
being written when we looked; and "can=N" - pages that we cancelled the storage
of.

What I'd really like to do is alter the behaviour of the cancellation
heuristics, depending on how necessary it is to expel pages.  If there are
plenty of other pages that aren't waiting to be written to the cache that
could be ejected first, then it would be nice to hold up on immediate
cancellation of cache writes - but I don't see a way of doing that.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/filesystems/caching/fscache.txt   |  4 ++
 Documentation/filesystems/caching/netfs-api.txt | 21 ++++++-
 fs/9p/cache.c                                   | 14 +----
 fs/afs/file.c                                   | 15 +----
 fs/fscache/internal.h                           |  5 ++
 fs/fscache/page.c                               | 79 ++++++++++++++++++++++++-
 fs/fscache/stats.c                              | 11 ++++
 fs/nfs/fscache.c                                | 10 +---
 include/linux/fscache-cache.h                   |  1 +
 include/linux/fscache.h                         | 27 +++++++++
 10 files changed, 152 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index 057a3c71d524..7097fd29fb3d 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -272,6 +272,10 @@ proc files.
 		pgs=N	Number of pages given store req processing time
 		rxd=N	Number of store reqs deleted from tracking tree
 		olm=N	Number of store reqs over store limit
+	VmScan	nos=N	Number of release reqs against pages with no pending store
+		gon=N	Number of release reqs against pages stored by time lock granted
+		bsy=N	Number of release reqs ignored due to in-progress store
+		can=N	Number of page stores cancelled due to release req
 	Ops	pend=N	Number of times async ops added to pending queues
 		run=N	Number of times async ops given CPU time
 		enq=N	Number of times async ops queued for processing
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 2666b1ed5e9e..1902c57b72ef 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -641,7 +641,7 @@ data file must be retired (see the relinquish cookie function below).
 
 Furthermore, note that this does not cancel the asynchronous read or write
 operation started by the read/alloc and write functions, so the page
-invalidation and release functions must use:
+invalidation functions must use:
 
 	bool fscache_check_page_write(struct fscache_cookie *cookie,
 				      struct page *page);
@@ -654,6 +654,25 @@ to see if a page is being written to the cache, and:
 to wait for it to finish if it is.
 
 
+When releasepage() is being implemented, a special FS-Cache function exists to
+manage the heuristics of coping with vmscan trying to eject pages, which may
+conflict with the cache trying to write pages to the cache (which may itself
+need to allocate memory):
+
+	bool fscache_maybe_release_page(struct fscache_cookie *cookie,
+					struct page *page,
+					gfp_t gfp);
+
+This takes the netfs cookie, and the page and gfp arguments as supplied to
+releasepage().  It will return false if the page cannot be released yet for
+some reason and if it returns true, the page has been uncached and can now be
+released.
+
+To make a page available for release, this function may wait for an outstanding
+storage request to complete, or it may attempt to cancel the storage request -
+in which case the page will not be stored in the cache this time.
+
+
 ==========================
 INDEX AND DATA FILE UPDATE
 ==========================
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 51c94e26a346..bcc5357a9069 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -343,18 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
 
 	BUG_ON(!vcookie->fscache);
 
-	if (PageFsCache(page)) {
-		if (fscache_check_page_write(vcookie->fscache, page)) {
-			if (!(gfp & __GFP_WAIT))
-				return 0;
-			fscache_wait_on_page_write(vcookie->fscache, page);
-		}
-
-		fscache_uncache_page(vcookie->fscache, page);
-		ClearPageFsCache(page);
-	}
-
-	return 1;
+	return fscache_maybe_release_page(vnode->cache, page, gfp);
 }
 
 void __v9fs_fscache_invalidate_page(struct page *page)
@@ -368,7 +357,6 @@ void __v9fs_fscache_invalidate_page(struct page *page)
 		fscache_wait_on_page_write(vcookie->fscache, page);
 		BUG_ON(!PageLocked(page));
 		fscache_uncache_page(vcookie->fscache, page);
-		ClearPageFsCache(page);
 	}
 }
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 681c2a7b013f..39b301662f22 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -315,7 +315,6 @@ static void afs_invalidatepage(struct page *page, unsigned long offset)
 			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
 			fscache_wait_on_page_write(vnode->cache, page);
 			fscache_uncache_page(vnode->cache, page);
-			ClearPageFsCache(page);
 		}
 #endif
 
@@ -349,17 +348,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 	/* deny if page is being written to the cache and the caller hasn't
 	 * elected to wait */
 #ifdef CONFIG_AFS_FSCACHE
-	if (PageFsCache(page)) {
-		if (fscache_check_page_write(vnode->cache, page)) {
-			if (!(gfp_flags & __GFP_WAIT)) {
-				_leave(" = F [cache busy]");
-				return 0;
-			}
-			fscache_wait_on_page_write(vnode->cache, page);
-		}
-
-		fscache_uncache_page(vnode->cache, page);
-		ClearPageFsCache(page);
+	if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
+		_leave(" = F [cache busy]");
+		return 0;
 	}
 #endif
 
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index a0769872b19c..e5046519b153 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -180,6 +180,11 @@ extern atomic_t fscache_n_store_pages;
 extern atomic_t fscache_n_store_radix_deletes;
 extern atomic_t fscache_n_store_pages_over_limit;
 
+extern atomic_t fscache_n_store_vmscan_not_storing;
+extern atomic_t fscache_n_store_vmscan_gone;
+extern atomic_t fscache_n_store_vmscan_busy;
+extern atomic_t fscache_n_store_vmscan_cancelled;
+
 extern atomic_t fscache_n_marks;
 extern atomic_t fscache_n_uncaches;
 
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 022a5da8e130..fc76798bd968 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -42,6 +42,75 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
 }
 EXPORT_SYMBOL(__fscache_wait_on_page_write);
 
+/*
+ * decide whether a page can be released, possibly by cancelling a store to it
+ * - we're allowed to sleep if __GFP_WAIT is flagged
+ */
+bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
+				  struct page *page,
+				  gfp_t gfp)
+{
+	struct page *xpage;
+	void *val;
+
+	_enter("%p,%p,%x", cookie, page, gfp);
+
+	rcu_read_lock();
+	val = radix_tree_lookup(&cookie->stores, page->index);
+	if (!val) {
+		rcu_read_unlock();
+		fscache_stat(&fscache_n_store_vmscan_not_storing);
+		__fscache_uncache_page(cookie, page);
+		return true;
+	}
+
+	/* see if the page is actually undergoing storage - if so we can't get
+	 * rid of it till the cache has finished with it */
+	if (radix_tree_tag_get(&cookie->stores, page->index,
+			       FSCACHE_COOKIE_STORING_TAG)) {
+		rcu_read_unlock();
+		goto page_busy;
+	}
+
+	/* the page is pending storage, so we attempt to cancel the store and
+	 * discard the store request so that the page can be reclaimed */
+	spin_lock(&cookie->stores_lock);
+	rcu_read_unlock();
+
+	if (radix_tree_tag_get(&cookie->stores, page->index,
+			       FSCACHE_COOKIE_STORING_TAG)) {
+		/* the page started to undergo storage whilst we were looking,
+		 * so now we can only wait or return */
+		spin_unlock(&cookie->stores_lock);
+		goto page_busy;
+	}
+
+	xpage = radix_tree_delete(&cookie->stores, page->index);
+	spin_unlock(&cookie->stores_lock);
+
+	if (xpage) {
+		fscache_stat(&fscache_n_store_vmscan_cancelled);
+		fscache_stat(&fscache_n_store_radix_deletes);
+		ASSERTCMP(xpage, ==, page);
+	} else {
+		fscache_stat(&fscache_n_store_vmscan_gone);
+	}
+
+	wake_up_bit(&cookie->flags, 0);
+	if (xpage)
+		page_cache_release(xpage);
+	__fscache_uncache_page(cookie, page);
+	return true;
+
+page_busy:
+	/* we might want to wait here, but that could deadlock the allocator as
+	 * the slow-work threads writing to the cache may all end up sleeping
+	 * on memory allocation */
+	fscache_stat(&fscache_n_store_vmscan_busy);
+	return false;
+}
+EXPORT_SYMBOL(__fscache_maybe_release_page);
+
 /*
  * note that a page has finished being written to the cache
  */
@@ -57,6 +126,8 @@ static void fscache_end_page_write(struct fscache_object *object,
 		/* delete the page from the tree if it is now no longer
 		 * pending */
 		spin_lock(&cookie->stores_lock);
+		radix_tree_tag_clear(&cookie->stores, page->index,
+				     FSCACHE_COOKIE_STORING_TAG);
 		if (!radix_tree_tag_get(&cookie->stores, page->index,
 					FSCACHE_COOKIE_PENDING_TAG)) {
 			fscache_stat(&fscache_n_store_radix_deletes);
@@ -640,8 +711,12 @@ static void fscache_write_op(struct fscache_operation *_op)
 		goto superseded;
 	}
 
-	radix_tree_tag_clear(&cookie->stores, page->index,
-			     FSCACHE_COOKIE_PENDING_TAG);
+	if (page) {
+		radix_tree_tag_set(&cookie->stores, page->index,
+				   FSCACHE_COOKIE_STORING_TAG);
+		radix_tree_tag_clear(&cookie->stores, page->index,
+				     FSCACHE_COOKIE_PENDING_TAG);
+	}
 
 	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 045ba396dbf2..cda69994e06d 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -63,6 +63,11 @@ atomic_t fscache_n_store_pages;
 atomic_t fscache_n_store_radix_deletes;
 atomic_t fscache_n_store_pages_over_limit;
 
+atomic_t fscache_n_store_vmscan_not_storing;
+atomic_t fscache_n_store_vmscan_gone;
+atomic_t fscache_n_store_vmscan_busy;
+atomic_t fscache_n_store_vmscan_cancelled;
+
 atomic_t fscache_n_marks;
 atomic_t fscache_n_uncaches;
 
@@ -211,6 +216,12 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_store_radix_deletes),
 		   atomic_read(&fscache_n_store_pages_over_limit));
 
+	seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n",
+		   atomic_read(&fscache_n_store_vmscan_not_storing),
+		   atomic_read(&fscache_n_store_vmscan_gone),
+		   atomic_read(&fscache_n_store_vmscan_busy),
+		   atomic_read(&fscache_n_store_vmscan_cancelled));
+
 	seq_printf(m, "Ops    : pend=%u run=%u enq=%u can=%u rej=%u\n",
 		   atomic_read(&fscache_n_op_pend),
 		   atomic_read(&fscache_n_op_run),
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 70fad69eb959..fa588006588d 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -359,17 +359,13 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 
 	BUG_ON(!cookie);
 
-	if (fscache_check_page_write(cookie, page)) {
-		if (!(gfp & __GFP_WAIT))
-			return 0;
-		fscache_wait_on_page_write(cookie, page);
-	}
-
 	if (PageFsCache(page)) {
 		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
 			 cookie, page, nfsi);
 
-		fscache_uncache_page(cookie, page);
+		if (!fscache_maybe_release_page(cookie, page, gfp))
+			return 0;
+
 		nfs_add_fscache_stats(page->mapping->host,
 				      NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
 	}
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index f3aa4bdafef6..4750d5fb419f 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -317,6 +317,7 @@ struct fscache_cookie {
 	void				*netfs_data;	/* back pointer to netfs */
 	struct radix_tree_root		stores;		/* pages to be stored on this cookie */
 #define FSCACHE_COOKIE_PENDING_TAG	0		/* pages tag: pending write to cache */
+#define FSCACHE_COOKIE_STORING_TAG	1		/* pages tag: writing to cache */
 
 	unsigned long			flags;
 #define FSCACHE_COOKIE_LOOKING_UP	0	/* T if non-index cookie being looked up still */
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 6d8ee466e0a0..595ce49288b7 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -202,6 +202,8 @@ extern int __fscache_write_page(struct fscache_cookie *, struct page *, gfp_t);
 extern void __fscache_uncache_page(struct fscache_cookie *, struct page *);
 extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *);
 extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *);
+extern bool __fscache_maybe_release_page(struct fscache_cookie *, struct page *,
+					 gfp_t);
 
 /**
  * fscache_register_netfs - Register a filesystem as desiring caching services
@@ -615,4 +617,29 @@ void fscache_wait_on_page_write(struct fscache_cookie *cookie,
 		__fscache_wait_on_page_write(cookie, page);
 }
 
+/**
+ * fscache_maybe_release_page - Consider releasing a page, cancelling a store
+ * @cookie: The cookie representing the cache object
+ * @page: The netfs page that is being cached.
+ * @gfp: The gfp flags passed to releasepage()
+ *
+ * Consider releasing a page for the vmscan algorithm, on behalf of the netfs's
+ * releasepage() call.  A storage request on the page may cancelled if it is
+ * not currently being processed.
+ *
+ * The function returns true if the page no longer has a storage request on it,
+ * and false if a storage request is left in place.  If true is returned, the
+ * page will have been passed to fscache_uncache_page().  If false is returned
+ * the page cannot be freed yet.
+ */
+static inline
+bool fscache_maybe_release_page(struct fscache_cookie *cookie,
+				struct page *page,
+				gfp_t gfp)
+{
+	if (fscache_cookie_valid(cookie) && PageFsCache(page))
+		return __fscache_maybe_release_page(cookie, page, gfp);
+	return false;
+}
+
 #endif /* _LINUX_FSCACHE_H */
-- 
cgit v1.2.3


From 60d543ca724be155c2b6166e36a00c80b21bd810 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:11:45 +0000
Subject: FS-Cache: Start processing an object's operations on that object's
 death

Start processing an object's operations when that object moves into the DYING
state as the object cannot be destroyed until all its outstanding operations
have completed.

Furthermore, make sure that read and allocation operations handle being woken
up on a dead object.  Such events are recorded in the Allocs.abt and
Retrvls.abt statistics as viewable through /proc/fs/fscache/stats.

The code for waiting for object activation for the read and allocation
operations is also extracted into its own function as it is much the same in
all cases, differing only in the stats incremented.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/filesystems/caching/fscache.txt |   2 +
 fs/fscache/internal.h                         |   5 ++
 fs/fscache/object.c                           |   1 +
 fs/fscache/page.c                             | 112 +++++++++++++-------------
 fs/fscache/stats.c                            |  12 ++-
 include/linux/fscache-cache.h                 |   4 +
 6 files changed, 75 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index 7097fd29fb3d..3c23411956bb 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -253,6 +253,7 @@ proc files.
 		int=N	Number of alloc reqs aborted -ERESTARTSYS
 		ops=N	Number of alloc reqs submitted
 		owt=N	Number of alloc reqs waited for CPU time
+		abt=N	Number of alloc reqs aborted due to object death
 	Retrvls	n=N	Number of retrieval (read) requests seen
 		ok=N	Number of successful retr reqs
 		wt=N	Number of retr reqs that waited on lookup completion
@@ -262,6 +263,7 @@ proc files.
 		oom=N	Number of retr reqs failed -ENOMEM
 		ops=N	Number of retr reqs submitted
 		owt=N	Number of retr reqs waited for CPU time
+		abt=N	Number of retr reqs aborted due to object death
 	Stores	n=N	Number of storage (write) requests seen
 		ok=N	Number of successful store reqs
 		agn=N	Number of store reqs on a page already pending storage
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 2bf463d26080..5b49a373689b 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -156,6 +156,7 @@ extern atomic_t fscache_n_allocs_ok;
 extern atomic_t fscache_n_allocs_wait;
 extern atomic_t fscache_n_allocs_nobufs;
 extern atomic_t fscache_n_allocs_intr;
+extern atomic_t fscache_n_allocs_object_dead;
 extern atomic_t fscache_n_alloc_ops;
 extern atomic_t fscache_n_alloc_op_waits;
 
@@ -166,6 +167,7 @@ extern atomic_t fscache_n_retrievals_nodata;
 extern atomic_t fscache_n_retrievals_nobufs;
 extern atomic_t fscache_n_retrievals_intr;
 extern atomic_t fscache_n_retrievals_nomem;
+extern atomic_t fscache_n_retrievals_object_dead;
 extern atomic_t fscache_n_retrieval_ops;
 extern atomic_t fscache_n_retrieval_op_waits;
 
@@ -249,9 +251,12 @@ static inline void fscache_stat_d(atomic_t *stat)
 	atomic_dec(stat);
 }
 
+#define __fscache_stat(stat) (stat)
+
 extern const struct file_operations fscache_stats_fops;
 #else
 
+#define __fscache_stat(stat) (NULL)
 #define fscache_stat(stat) do {} while (0)
 #endif
 
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 74bc562a2cbc..c85c9f582166 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -201,6 +201,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
 		}
 		spin_unlock(&object->lock);
 		fscache_enqueue_dependents(object);
+		fscache_start_operations(object);
 		goto terminal_transit;
 
 		/* handle an abort during initialisation */
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index fc76798bd968..c598ea4c4e7d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -313,6 +313,43 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
 	return 0;
 }
 
+/*
+ * wait for an object to become active (or dead)
+ */
+static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
+						 struct fscache_retrieval *op,
+						 atomic_t *stat_op_waits,
+						 atomic_t *stat_object_dead)
+{
+	int ret;
+
+	if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags))
+		goto check_if_dead;
+
+	_debug(">>> WT");
+	fscache_stat(stat_op_waits);
+	if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+			fscache_wait_bit_interruptible,
+			TASK_INTERRUPTIBLE) < 0) {
+		ret = fscache_cancel_op(&op->op);
+		if (ret == 0)
+			return -ERESTARTSYS;
+
+		/* it's been removed from the pending queue by another party,
+		 * so we should get to run shortly */
+		wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+	}
+	_debug("<<< GO");
+
+check_if_dead:
+	if (unlikely(fscache_object_is_dead(object))) {
+		fscache_stat(stat_object_dead);
+		return -ENOBUFS;
+	}
+	return 0;
+}
+
 /*
  * read a page from the cache or allocate a block in which to store it
  * - we return:
@@ -376,25 +413,12 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
-	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
-		_debug(">>> WT");
-		fscache_stat(&fscache_n_retrieval_op_waits);
-		if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
-				fscache_wait_bit_interruptible,
-				TASK_INTERRUPTIBLE) < 0) {
-			ret = fscache_cancel_op(&op->op);
-			if (ret == 0) {
-				ret = -ERESTARTSYS;
-				goto error;
-			}
-
-			/* it's been removed from the pending queue by another
-			 * party, so we should get to run shortly */
-			wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
-				    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-		}
-		_debug("<<< GO");
-	}
+	ret = fscache_wait_for_retrieval_activation(
+		object, op,
+		__fscache_stat(&fscache_n_retrieval_op_waits),
+		__fscache_stat(&fscache_n_retrievals_object_dead));
+	if (ret < 0)
+		goto error;
 
 	/* ask the cache to honour the operation */
 	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
@@ -506,25 +530,12 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
-	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
-		_debug(">>> WT");
-		fscache_stat(&fscache_n_retrieval_op_waits);
-		if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
-				fscache_wait_bit_interruptible,
-				TASK_INTERRUPTIBLE) < 0) {
-			ret = fscache_cancel_op(&op->op);
-			if (ret == 0) {
-				ret = -ERESTARTSYS;
-				goto error;
-			}
-
-			/* it's been removed from the pending queue by another
-			 * party, so we should get to run shortly */
-			wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
-				    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-		}
-		_debug("<<< GO");
-	}
+	ret = fscache_wait_for_retrieval_activation(
+		object, op,
+		__fscache_stat(&fscache_n_retrieval_op_waits),
+		__fscache_stat(&fscache_n_retrievals_object_dead));
+	if (ret < 0)
+		goto error;
 
 	/* ask the cache to honour the operation */
 	if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
@@ -612,25 +623,12 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 
 	fscache_stat(&fscache_n_alloc_ops);
 
-	if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
-		_debug(">>> WT");
-		fscache_stat(&fscache_n_alloc_op_waits);
-		if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
-				fscache_wait_bit_interruptible,
-				TASK_INTERRUPTIBLE) < 0) {
-			ret = fscache_cancel_op(&op->op);
-			if (ret == 0) {
-				ret = -ERESTARTSYS;
-				goto error;
-			}
-
-			/* it's been removed from the pending queue by another
-			 * party, so we should get to run shortly */
-			wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
-				    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-		}
-		_debug("<<< GO");
-	}
+	ret = fscache_wait_for_retrieval_activation(
+		object, op,
+		__fscache_stat(&fscache_n_alloc_op_waits),
+		__fscache_stat(&fscache_n_allocs_object_dead));
+	if (ret < 0)
+		goto error;
 
 	/* ask the cache to honour the operation */
 	fscache_stat(&fscache_n_cop_allocate_page);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 9e15289eb5c1..05f77caf4a2d 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -39,6 +39,7 @@ atomic_t fscache_n_allocs_ok;
 atomic_t fscache_n_allocs_wait;
 atomic_t fscache_n_allocs_nobufs;
 atomic_t fscache_n_allocs_intr;
+atomic_t fscache_n_allocs_object_dead;
 atomic_t fscache_n_alloc_ops;
 atomic_t fscache_n_alloc_op_waits;
 
@@ -49,6 +50,7 @@ atomic_t fscache_n_retrievals_nodata;
 atomic_t fscache_n_retrievals_nobufs;
 atomic_t fscache_n_retrievals_intr;
 atomic_t fscache_n_retrievals_nomem;
+atomic_t fscache_n_retrievals_object_dead;
 atomic_t fscache_n_retrieval_ops;
 atomic_t fscache_n_retrieval_op_waits;
 
@@ -188,9 +190,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_allocs_wait),
 		   atomic_read(&fscache_n_allocs_nobufs),
 		   atomic_read(&fscache_n_allocs_intr));
-	seq_printf(m, "Allocs : ops=%u owt=%u\n",
+	seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n",
 		   atomic_read(&fscache_n_alloc_ops),
-		   atomic_read(&fscache_n_alloc_op_waits));
+		   atomic_read(&fscache_n_alloc_op_waits),
+		   atomic_read(&fscache_n_allocs_object_dead));
 
 	seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
 		   " int=%u oom=%u\n",
@@ -201,9 +204,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_retrievals_nobufs),
 		   atomic_read(&fscache_n_retrievals_intr),
 		   atomic_read(&fscache_n_retrievals_nomem));
-	seq_printf(m, "Retrvls: ops=%u owt=%u\n",
+	seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n",
 		   atomic_read(&fscache_n_retrieval_ops),
-		   atomic_read(&fscache_n_retrieval_op_waits));
+		   atomic_read(&fscache_n_retrieval_op_waits),
+		   atomic_read(&fscache_n_retrievals_object_dead));
 
 	seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
 		   atomic_read(&fscache_n_stores),
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 4750d5fb419f..907bb56c5888 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -404,6 +404,10 @@ extern const char *fscache_object_states[];
 	 (obj)->state >= FSCACHE_OBJECT_AVAILABLE &&	      \
 	 (obj)->state < FSCACHE_OBJECT_DYING)
 
+#define fscache_object_is_dead(obj)				\
+	(test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) &&	\
+	 (obj)->state >= FSCACHE_OBJECT_DYING)
+
 extern const struct slow_work_ops fscache_object_slow_work_ops;
 
 /**
-- 
cgit v1.2.3


From a17754fb8c28af19cd70dcbec6d5b0773b94e0c1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:11:52 +0000
Subject: CacheFiles: Don't write a full page if there's only a partial page to
 cache

cachefiles_write_page() writes a full page to the backing file for the last
page of the netfs file, even if the netfs file's last page is only a partial
page.

This causes the EOF on the backing file to be extended beyond the EOF of the
netfs, and thus the backing file will be truncated by cachefiles_attr_changed()
called from cachefiles_lookup_object().

So we need to limit the write we make to the backing file on that last page
such that it doesn't push the EOF too far.

Also, if a backing file that has a partial page at the end is expanded, we
discard the partial page and refetch it on the basis that we then have a hole
in the file with invalid data, and should the power go out...  A better way to
deal with this could be to record a note that the partial page contains invalid
data until the correct data is written into it.

This isn't a problem for netfs's that discard the whole backing file if the
file size changes (such as NFS).

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/cachefiles/interface.c     | 20 +++++++++++++++++---
 fs/cachefiles/rdwr.c          | 23 +++++++++++++++++++----
 include/linux/fscache-cache.h |  3 +++
 3 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index dd7f852746cb..8e67abf05985 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -404,12 +404,26 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
 	if (oi_size == ni_size)
 		return 0;
 
-	newattrs.ia_size = ni_size;
-	newattrs.ia_valid = ATTR_SIZE;
-
 	cachefiles_begin_secure(cache, &saved_cred);
 	mutex_lock(&object->backer->d_inode->i_mutex);
+
+	/* if there's an extension to a partial page at the end of the backing
+	 * file, we need to discard the partial page so that we pick up new
+	 * data after it */
+	if (oi_size & ~PAGE_MASK && ni_size > oi_size) {
+		_debug("discard tail %llx", oi_size);
+		newattrs.ia_valid = ATTR_SIZE;
+		newattrs.ia_size = oi_size & PAGE_MASK;
+		ret = notify_change(object->backer, &newattrs);
+		if (ret < 0)
+			goto truncate_failed;
+	}
+
+	newattrs.ia_valid = ATTR_SIZE;
+	newattrs.ia_size = ni_size;
 	ret = notify_change(object->backer, &newattrs);
+
+truncate_failed:
 	mutex_unlock(&object->backer->d_inode->i_mutex);
 	cachefiles_end_secure(cache, saved_cred);
 
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 3304646dae84..5a84fd7109ad 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -803,7 +803,8 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
 	struct cachefiles_cache *cache;
 	mm_segment_t old_fs;
 	struct file *file;
-	loff_t pos;
+	loff_t pos, eof;
+	size_t len;
 	void *data;
 	int ret;
 
@@ -837,15 +838,29 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
 		ret = -EIO;
 		if (file->f_op->write) {
 			pos = (loff_t) page->index << PAGE_SHIFT;
+
+			/* we mustn't write more data than we have, so we have
+			 * to beware of a partial page at EOF */
+			eof = object->fscache.store_limit_l;
+			len = PAGE_SIZE;
+			if (eof & ~PAGE_MASK) {
+				ASSERTCMP(pos, <, eof);
+				if (eof - pos < PAGE_SIZE) {
+					_debug("cut short %llx to %llx",
+					       pos, eof);
+					len = eof - pos;
+					ASSERTCMP(pos + len, ==, eof);
+				}
+			}
+
 			data = kmap(page);
 			old_fs = get_fs();
 			set_fs(KERNEL_DS);
 			ret = file->f_op->write(
-				file, (const void __user *) data, PAGE_SIZE,
-				&pos);
+				file, (const void __user *) data, len, &pos);
 			set_fs(old_fs);
 			kunmap(page);
-			if (ret != PAGE_SIZE)
+			if (ret != len)
 				ret = -EIO;
 		}
 		fput(file);
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 907bb56c5888..5db50002f3b5 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -395,6 +395,7 @@ struct fscache_object {
 	struct rb_node		objlist_link;	/* link in global object list */
 #endif
 	pgoff_t			store_limit;	/* current storage limit */
+	loff_t			store_limit_l;	/* current storage limit */
 };
 
 extern const char *fscache_object_states[];
@@ -439,6 +440,7 @@ void fscache_object_init(struct fscache_object *object,
 	object->events = object->event_mask = 0;
 	object->flags = 0;
 	object->store_limit = 0;
+	object->store_limit_l = 0;
 	object->cache = cache;
 	object->cookie = cookie;
 	object->parent = NULL;
@@ -491,6 +493,7 @@ static inline void fscache_object_lookup_error(struct fscache_object *object)
 static inline
 void fscache_set_store_limit(struct fscache_object *object, loff_t i_size)
 {
+	object->store_limit_l = i_size;
 	object->store_limit = i_size >> PAGE_SHIFT;
 	if (i_size & ~PAGE_MASK)
 		object->store_limit++;
-- 
cgit v1.2.3


From fee096deb4f33897937b974cb2c5168bab7935be Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:12:05 +0000
Subject: CacheFiles: Catch an overly long wait for an old active object

Catch an overly long wait for an old, dying active object when we want to
replace it with a new one.  The probability is that all the slow-work threads
are hogged, and the delete can't get a look in.

What we do instead is:

 (1) if there's nothing in the slow work queue, we sleep until either the dying
     object has finished dying or there is something in the slow work queue
     behind which we can queue our object.

 (2) if there is something in the slow work queue, we return ETIMEDOUT to
     fscache_lookup_object(), which then puts us back on the slow work queue,
     presumably behind the deletion that we're blocked by.  We are then
     deferred for a while until we work our way back through the queue -
     without blocking a slow-work thread unnecessarily.

A backtrace similar to the following may appear in the log without this patch:

	INFO: task kslowd004:5711 blocked for more than 120 seconds.
	"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
	kslowd004     D 0000000000000000     0  5711      2 0x00000080
	 ffff88000340bb80 0000000000000046 ffff88002550d000 0000000000000000
	 ffff88002550d000 0000000000000007 ffff88000340bfd8 ffff88002550d2a8
	 000000000000ddf0 00000000000118c0 00000000000118c0 ffff88002550d2a8
	Call Trace:
	 [<ffffffff81058e21>] ? trace_hardirqs_on+0xd/0xf
	 [<ffffffffa011c4d8>] ? cachefiles_wait_bit+0x0/0xd [cachefiles]
	 [<ffffffffa011c4e1>] cachefiles_wait_bit+0x9/0xd [cachefiles]
	 [<ffffffff81353153>] __wait_on_bit+0x43/0x76
	 [<ffffffff8111ae39>] ? ext3_xattr_get+0x1ec/0x270
	 [<ffffffff813531ef>] out_of_line_wait_on_bit+0x69/0x74
	 [<ffffffffa011c4d8>] ? cachefiles_wait_bit+0x0/0xd [cachefiles]
	 [<ffffffff8104c125>] ? wake_bit_function+0x0/0x2e
	 [<ffffffffa011bc79>] cachefiles_mark_object_active+0x203/0x23b [cachefiles]
	 [<ffffffffa011c209>] cachefiles_walk_to_object+0x558/0x827 [cachefiles]
	 [<ffffffffa011a429>] cachefiles_lookup_object+0xac/0x12a [cachefiles]
	 [<ffffffffa00aa1e9>] fscache_lookup_object+0x1c7/0x214 [fscache]
	 [<ffffffffa00aafc5>] fscache_object_state_machine+0xa5/0x52d [fscache]
	 [<ffffffffa00ab4ac>] fscache_object_slow_work_execute+0x5f/0xa0 [fscache]
	 [<ffffffff81082093>] slow_work_execute+0x18f/0x2d1
	 [<ffffffff8108239a>] slow_work_thread+0x1c5/0x308
	 [<ffffffff8104c0f1>] ? autoremove_wake_function+0x0/0x34
	 [<ffffffff810821d5>] ? slow_work_thread+0x0/0x308
	 [<ffffffff8104be91>] kthread+0x7a/0x82
	 [<ffffffff8100beda>] child_rip+0xa/0x20
	 [<ffffffff8100b87c>] ? restore_args+0x0/0x30
	 [<ffffffff8104be17>] ? kthread+0x0/0x82
	 [<ffffffff8100bed0>] ? child_rip+0x0/0x20
	1 lock held by kslowd004/5711:
	 #0:  (&sb->s_type->i_mutex_key#7/1){+.+.+.}, at: [<ffffffffa011be64>] cachefiles_walk_to_object+0x1b3/0x827 [cachefiles]

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/filesystems/caching/fscache.txt |  1 +
 fs/cachefiles/interface.c                     |  6 +-
 fs/cachefiles/namei.c                         | 87 +++++++++++++++++++++------
 fs/fscache/internal.h                         |  1 +
 fs/fscache/object.c                           | 10 ++-
 fs/fscache/stats.c                            |  4 +-
 include/linux/fscache-cache.h                 |  6 +-
 7 files changed, 90 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index 3c23411956bb..a91e2e2095b0 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -235,6 +235,7 @@ proc files.
 		neg=N	Number of negative lookups made
 		pos=N	Number of positive lookups made
 		crt=N	Number of objects created by lookup
+		tmo=N	Number of lookups timed out and requeued
 	Updates	n=N	Number of update cookie requests seen
 		nul=N	Number of upd reqs given a NULL parent
 		run=N	Number of upd reqs granted CPU time
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 8e67abf05985..9d3c426044ae 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -114,8 +114,9 @@ nomem_lookup_data:
 
 /*
  * attempt to look up the nominated node in this cache
+ * - return -ETIMEDOUT to be scheduled again
  */
-static void cachefiles_lookup_object(struct fscache_object *_object)
+static int cachefiles_lookup_object(struct fscache_object *_object)
 {
 	struct cachefiles_lookup_data *lookup_data;
 	struct cachefiles_object *parent, *object;
@@ -145,13 +146,14 @@ static void cachefiles_lookup_object(struct fscache_object *_object)
 	    object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
 		cachefiles_attr_changed(&object->fscache);
 
-	if (ret < 0) {
+	if (ret < 0 && ret != -ETIMEDOUT) {
 		printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
 		       ret);
 		fscache_object_lookup_error(&object->fscache);
 	}
 
 	_leave(" [%d]", ret);
+	return ret;
 }
 
 /*
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 00a0cda8f47a..14ac4806e291 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -21,12 +21,6 @@
 #include <linux/security.h>
 #include "internal.h"
 
-static int cachefiles_wait_bit(void *flags)
-{
-	schedule();
-	return 0;
-}
-
 #define CACHEFILES_KEYBUF_SIZE 512
 
 /*
@@ -100,8 +94,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
 /*
  * record the fact that an object is now active
  */
-static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
-					  struct cachefiles_object *object)
+static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
+					 struct cachefiles_object *object)
 {
 	struct cachefiles_object *xobject;
 	struct rb_node **_p, *_parent = NULL;
@@ -139,8 +133,8 @@ try_again:
 	rb_insert_color(&object->active_node, &cache->active_nodes);
 
 	write_unlock(&cache->active_lock);
-	_leave("");
-	return;
+	_leave(" = 0");
+	return 0;
 
 	/* an old object from a previous incarnation is hogging the slot - we
 	 * need to wait for it to be destroyed */
@@ -155,13 +149,64 @@ wait_for_old_object:
 	atomic_inc(&xobject->usage);
 	write_unlock(&cache->active_lock);
 
-	_debug(">>> wait");
-	wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
-		    cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
-	_debug("<<< waited");
+	if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
+		wait_queue_head_t *wq;
+
+		signed long timeout = 60 * HZ;
+		wait_queue_t wait;
+		bool requeue;
+
+		/* if the object we're waiting for is queued for processing,
+		 * then just put ourselves on the queue behind it */
+		if (slow_work_is_queued(&xobject->fscache.work)) {
+			_debug("queue OBJ%x behind OBJ%x immediately",
+			       object->fscache.debug_id,
+			       xobject->fscache.debug_id);
+			goto requeue;
+		}
+
+		/* otherwise we sleep until either the object we're waiting for
+		 * is done, or the slow-work facility wants the thread back to
+		 * do other work */
+		wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
+		init_wait(&wait);
+		requeue = false;
+		do {
+			prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+			if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
+				break;
+			requeue = slow_work_sleep_till_thread_needed(
+				&object->fscache.work, &timeout);
+		} while (timeout > 0 && !requeue);
+		finish_wait(wq, &wait);
+
+		if (requeue &&
+		    test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
+			_debug("queue OBJ%x behind OBJ%x after wait",
+			       object->fscache.debug_id,
+			       xobject->fscache.debug_id);
+			goto requeue;
+		}
+
+		if (timeout <= 0) {
+			printk(KERN_ERR "\n");
+			printk(KERN_ERR "CacheFiles: Error: Overlong"
+			       " wait for old active object to go away\n");
+			cachefiles_printk_object(object, xobject);
+			goto requeue;
+		}
+	}
+
+	ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
 
 	cache->cache.ops->put_object(&xobject->fscache);
 	goto try_again;
+
+requeue:
+	clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+	cache->cache.ops->put_object(&xobject->fscache);
+	_leave(" = -ETIMEDOUT");
+	return -ETIMEDOUT;
 }
 
 /*
@@ -466,12 +511,15 @@ lookup_again:
 	}
 
 	/* note that we're now using this object */
-	cachefiles_mark_object_active(cache, object);
+	ret = cachefiles_mark_object_active(cache, object);
 
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(dir);
 	dir = NULL;
 
+	if (ret == -ETIMEDOUT)
+		goto mark_active_timed_out;
+
 	_debug("=== OBTAINED_OBJECT ===");
 
 	if (object->new) {
@@ -515,6 +563,10 @@ create_error:
 		cachefiles_io_error(cache, "Create/mkdir failed");
 	goto error;
 
+mark_active_timed_out:
+	_debug("mark active timed out");
+	goto release_dentry;
+
 check_error:
 	_debug("check error %d", ret);
 	write_lock(&cache->active_lock);
@@ -522,7 +574,7 @@ check_error:
 	clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
 	wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
 	write_unlock(&cache->active_lock);
-
+release_dentry:
 	dput(object->dentry);
 	object->dentry = NULL;
 	goto error_out;
@@ -543,9 +595,6 @@ error:
 error_out2:
 	dput(dir);
 error_out:
-	if (ret == -ENOSPC)
-		ret = -ENOBUFS;
-
 	_leave(" = error %d", -ret);
 	return ret;
 }
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 5b49a373689b..0ca2566e038c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -215,6 +215,7 @@ extern atomic_t fscache_n_object_no_alloc;
 extern atomic_t fscache_n_object_lookups;
 extern atomic_t fscache_n_object_lookups_negative;
 extern atomic_t fscache_n_object_lookups_positive;
+extern atomic_t fscache_n_object_lookups_timed_out;
 extern atomic_t fscache_n_object_created;
 extern atomic_t fscache_n_object_avail;
 extern atomic_t fscache_n_object_dead;
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index f3f952cf887e..e513ac599c8e 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -468,6 +468,7 @@ static void fscache_lookup_object(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
 	struct fscache_object *parent;
+	int ret;
 
 	_enter("");
 
@@ -493,12 +494,19 @@ static void fscache_lookup_object(struct fscache_object *object)
 
 	fscache_stat(&fscache_n_object_lookups);
 	fscache_stat(&fscache_n_cop_lookup_object);
-	object->cache->ops->lookup_object(object);
+	ret = object->cache->ops->lookup_object(object);
 	fscache_stat_d(&fscache_n_cop_lookup_object);
 
 	if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
 		set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
 
+	if (ret == -ETIMEDOUT) {
+		/* probably stuck behind another object, so move this one to
+		 * the back of the queue */
+		fscache_stat(&fscache_n_object_lookups_timed_out);
+		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+	}
+
 	_leave("");
 }
 
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 05f77caf4a2d..46435f3aae68 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -98,6 +98,7 @@ atomic_t fscache_n_object_no_alloc;
 atomic_t fscache_n_object_lookups;
 atomic_t fscache_n_object_lookups_negative;
 atomic_t fscache_n_object_lookups_positive;
+atomic_t fscache_n_object_lookups_timed_out;
 atomic_t fscache_n_object_created;
 atomic_t fscache_n_object_avail;
 atomic_t fscache_n_object_dead;
@@ -160,10 +161,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_acquires_nobufs),
 		   atomic_read(&fscache_n_acquires_oom));
 
-	seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
+	seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n",
 		   atomic_read(&fscache_n_object_lookups),
 		   atomic_read(&fscache_n_object_lookups_negative),
 		   atomic_read(&fscache_n_object_lookups_positive),
+		   atomic_read(&fscache_n_object_lookups_timed_out),
 		   atomic_read(&fscache_n_object_created));
 
 	seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 5db50002f3b5..7be0c6fbe880 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -234,8 +234,10 @@ struct fscache_cache_ops {
 	struct fscache_object *(*alloc_object)(struct fscache_cache *cache,
 					       struct fscache_cookie *cookie);
 
-	/* look up the object for a cookie */
-	void (*lookup_object)(struct fscache_object *object);
+	/* look up the object for a cookie
+	 * - return -ETIMEDOUT to be requeued
+	 */
+	int (*lookup_object)(struct fscache_object *object);
 
 	/* finished looking up */
 	void (*lookup_complete)(struct fscache_object *object);
-- 
cgit v1.2.3


From 8964be4a9a5ca8cab1219bb046db2f6d1936227c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 20 Nov 2009 15:35:04 -0800
Subject: net: rename skb->iif to skb->skb_iif

To help grep games, rename iif to skb_iif

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ieee802154/fakehard.c     | 2 +-
 drivers/net/ifb.c                 | 6 +++---
 include/linux/skbuff.h            | 4 ++--
 include/net/pkt_cls.h             | 4 ++--
 net/core/dev.c                    | 6 +++---
 net/core/skbuff.c                 | 2 +-
 net/netlabel/netlabel_unlabeled.c | 2 +-
 net/sched/act_mirred.c            | 2 +-
 net/sched/cls_flow.c              | 2 +-
 security/selinux/hooks.c          | 6 +++---
 security/smack/smack_lsm.c        | 4 ++--
 11 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ieee802154/fakehard.c b/drivers/ieee802154/fakehard.c
index f877f13e3ab3..617549f30ef9 100644
--- a/drivers/ieee802154/fakehard.c
+++ b/drivers/ieee802154/fakehard.c
@@ -282,7 +282,7 @@ static int ieee802154_fake_close(struct net_device *dev)
 static netdev_tx_t ieee802154_fake_xmit(struct sk_buff *skb,
 					      struct net_device *dev)
 {
-	skb->iif = dev->ifindex;
+	skb->skb_iif = dev->ifindex;
 	skb->dev = dev;
 	dev->stats.tx_packets++;
 	dev->stats.tx_bytes += skb->len;
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 69c25668dd63..f4081c0a2d9c 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -99,7 +99,7 @@ static void ri_tasklet(unsigned long dev)
 		stats->tx_bytes +=skb->len;
 
 		rcu_read_lock();
-		skb->dev = dev_get_by_index_rcu(&init_net, skb->iif);
+		skb->dev = dev_get_by_index_rcu(&init_net, skb->skb_iif);
 		if (!skb->dev) {
 			rcu_read_unlock();
 			dev_kfree_skb(skb);
@@ -107,7 +107,7 @@ static void ri_tasklet(unsigned long dev)
 			break;
 		}
 		rcu_read_unlock();
-		skb->iif = _dev->ifindex;
+		skb->skb_iif = _dev->ifindex;
 
 		if (from & AT_EGRESS) {
 			dp->st_rx_frm_egr++;
@@ -172,7 +172,7 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 	stats->rx_packets++;
 	stats->rx_bytes+=skb->len;
 
-	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->iif) {
+	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->skb_iif) {
 		dev_kfree_skb(skb);
 		stats->rx_dropped++;
 		return NETDEV_TX_OK;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 63f47426977a..89eed8cdd318 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -299,7 +299,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@nfctinfo: Relationship of this skb to the connection
  *	@nfct_reasm: netfilter conntrack re-assembly pointer
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
- *	@iif: ifindex of device we arrived on
+ *	@skb_iif: ifindex of device we arrived on
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
@@ -366,7 +366,7 @@ struct sk_buff {
 	struct nf_bridge_info	*nf_bridge;
 #endif
 
-	int			iif;
+	int			skb_iif;
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
 #ifdef CONFIG_NET_CLS_ACT
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 3dd210d073ca..dd3031aed9d5 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -343,9 +343,9 @@ tcf_match_indev(struct sk_buff *skb, char *indev)
 	struct net_device *dev;
 
 	if (indev[0]) {
-		if  (!skb->iif)
+		if  (!skb->skb_iif)
 			return 0;
-		dev = __dev_get_by_index(dev_net(skb->dev), skb->iif);
+		dev = __dev_get_by_index(dev_net(skb->dev), skb->skb_iif);
 		if (!dev || strcmp(indev, dev->name))
 			return 0;
 	}
diff --git a/net/core/dev.c b/net/core/dev.c
index 9977288583b8..09f3d6b9c0c8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2287,7 +2287,7 @@ static int ing_filter(struct sk_buff *skb)
 	if (MAX_RED_LOOP < ttl++) {
 		printk(KERN_WARNING
 		       "Redir loop detected Dropping packet (%d->%d)\n",
-		       skb->iif, dev->ifindex);
+		       skb->skb_iif, dev->ifindex);
 		return TC_ACT_SHOT;
 	}
 
@@ -2395,8 +2395,8 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (netpoll_receive_skb(skb))
 		return NET_RX_DROP;
 
-	if (!skb->iif)
-		skb->iif = skb->dev->ifindex;
+	if (!skb->skb_iif)
+		skb->skb_iif = skb->dev->ifindex;
 
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 739b8f4dd327..bfa3e7865a8c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -549,7 +549,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #endif
 	new->protocol		= old->protocol;
 	new->mark		= old->mark;
-	new->iif		= old->iif;
+	new->skb_iif		= old->skb_iif;
 	__nf_copy(new, old);
 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 3dfe2bac8623..98ed22ee2ff4 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1550,7 +1550,7 @@ int netlbl_unlabel_getattr(const struct sk_buff *skb,
 	struct netlbl_unlhsh_iface *iface;
 
 	rcu_read_lock();
-	iface = netlbl_unlhsh_search_iface_def(skb->iif);
+	iface = netlbl_unlhsh_search_iface_def(skb->skb_iif);
 	if (iface == NULL)
 		goto unlabel_getattr_nolabel;
 	switch (family) {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 797479369881..d329170243cb 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -185,7 +185,7 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
 
 	skb2->dev = dev;
-	skb2->iif = skb->dev->ifindex;
+	skb2->skb_iif = skb->dev->ifindex;
 	dev_queue_xmit(skb2);
 	err = 0;
 
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 9402a7fd3785..e054c62857e1 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -171,7 +171,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb)
 
 static u32 flow_get_iif(const struct sk_buff *skb)
 {
-	return skb->iif;
+	return skb->skb_iif;
 }
 
 static u32 flow_get_priority(const struct sk_buff *skb)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index bb230d5d7085..83a4aada0b4c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4085,7 +4085,7 @@ static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
 	char *addrp;
 
 	COMMON_AUDIT_DATA_INIT(&ad, NET);
-	ad.u.net.netif = skb->iif;
+	ad.u.net.netif = skb->skb_iif;
 	ad.u.net.family = family;
 	err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
 	if (err)
@@ -4147,7 +4147,7 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		return 0;
 
 	COMMON_AUDIT_DATA_INIT(&ad, NET);
-	ad.u.net.netif = skb->iif;
+	ad.u.net.netif = skb->skb_iif;
 	ad.u.net.family = family;
 	err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
 	if (err)
@@ -4159,7 +4159,7 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
 		if (err)
 			return err;
-		err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family,
+		err = selinux_inet_sys_rcv_skb(skb->skb_iif, addrp, family,
 					       peer_sid, &ad);
 		if (err) {
 			selinux_netlbl_err(skb, err, 0);
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index c33b6bb9b6dd..529c9ca65878 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -2602,7 +2602,7 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 #ifdef CONFIG_AUDIT
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
 	ad.a.u.net.family = sk->sk_family;
-	ad.a.u.net.netif = skb->iif;
+	ad.a.u.net.netif = skb->skb_iif;
 	ipv4_skb_to_auditdata(skb, &ad.a, NULL);
 #endif
 	/*
@@ -2757,7 +2757,7 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 #ifdef CONFIG_AUDIT
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
 	ad.a.u.net.family = family;
-	ad.a.u.net.netif = skb->iif;
+	ad.a.u.net.netif = skb->skb_iif;
 	ipv4_skb_to_auditdata(skb, &ad.a, NULL);
 #endif
 	/*
-- 
cgit v1.2.3


From 453f19eea7dbad837425e9b07d84568d14898794 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:43 +0100
Subject: perf: Allow for custom overflow handlers

in-kernel perf users might wish to have custom actions on the
sample interrupt.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212508.222339539@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 6 ++++++
 kernel/perf_event.c        | 8 +++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b5cdac0de370..a430ac3074af 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -567,6 +567,8 @@ struct perf_pending_entry {
 
 typedef void (*perf_callback_t)(struct perf_event *, void *);
 
+struct perf_sample_data;
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -658,6 +660,10 @@ struct perf_event {
 	struct pid_namespace		*ns;
 	u64				id;
 
+	void (*overflow_handler)(struct perf_event *event,
+			int nmi, struct perf_sample_data *data,
+			struct pt_regs *regs);
+
 #ifdef CONFIG_EVENT_PROFILE
 	struct event_filter		*filter;
 #endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3852e2656bb0..1dfb6cc4fdea 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3710,7 +3710,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 			perf_event_disable(event);
 	}
 
-	perf_event_output(event, nmi, data, regs);
+	if (event->overflow_handler)
+		event->overflow_handler(event, nmi, data, regs);
+	else
+		perf_event_output(event, nmi, data, regs);
+
 	return ret;
 }
 
@@ -4836,6 +4840,8 @@ inherit_event(struct perf_event *parent_event,
 	if (parent_event->attr.freq)
 		child_event->hw.sample_period = parent_event->hw.sample_period;
 
+	child_event->overflow_handler = parent_event->overflow_handler;
+
 	/*
 	 * Link it up in the child's context:
 	 */
-- 
cgit v1.2.3


From 59ed446f792cc07d37b1536b9c4664d14e25e425 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 20 Nov 2009 22:19:55 +0100
Subject: perf: Fix event scaling for inherited counters

Properly account the full hierarchy of counters for both the
count (we already did so) and the scale times (new).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091120212509.153379276@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  3 ++-
 kernel/perf_event.c        | 48 +++++++++++++++++++++++++---------------------
 2 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a430ac3074af..36fe89f72641 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -782,7 +782,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				int cpu,
 				pid_t pid,
 				perf_callback_t callback);
-extern u64 perf_event_read_value(struct perf_event *event);
+extern u64 perf_event_read_value(struct perf_event *event,
+				 u64 *enabled, u64 *running);
 
 struct perf_sample_data {
 	u64				type;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fdfae888a67c..80f40da9a01e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1774,14 +1774,25 @@ static int perf_event_read_size(struct perf_event *event)
 	return size;
 }
 
-u64 perf_event_read_value(struct perf_event *event)
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
 	struct perf_event *child;
 	u64 total = 0;
 
+	*enabled = 0;
+	*running = 0;
+
 	total += perf_event_read(event);
-	list_for_each_entry(child, &event->child_list, child_list)
+	*enabled += event->total_time_enabled +
+			atomic64_read(&event->child_total_time_enabled);
+	*running += event->total_time_running +
+			atomic64_read(&event->child_total_time_running);
+
+	list_for_each_entry(child, &event->child_list, child_list) {
 		total += perf_event_read(child);
+		*enabled += child->total_time_enabled;
+		*running += child->total_time_running;
+	}
 
 	return total;
 }
@@ -1793,19 +1804,15 @@ static int perf_event_read_group(struct perf_event *event,
 	struct perf_event *leader = event->group_leader, *sub;
 	int n = 0, size = 0, ret = 0;
 	u64 values[5];
-	u64 count;
+	u64 count, enabled, running;
 
-	count = perf_event_read_value(leader);
+	count = perf_event_read_value(leader, &enabled, &running);
 
 	values[n++] = 1 + leader->nr_siblings;
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = leader->total_time_enabled +
-			atomic64_read(&leader->child_total_time_enabled);
-	}
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = leader->total_time_running +
-			atomic64_read(&leader->child_total_time_running);
-	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = enabled;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = running;
 	values[n++] = count;
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
@@ -1820,7 +1827,7 @@ static int perf_event_read_group(struct perf_event *event,
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		n = 0;
 
-		values[n++] = perf_event_read_value(sub);
+		values[n++] = perf_event_read_value(sub, &enabled, &running);
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
 
@@ -1838,18 +1845,15 @@ static int perf_event_read_group(struct perf_event *event,
 static int perf_event_read_one(struct perf_event *event,
 				 u64 read_format, char __user *buf)
 {
+	u64 enabled, running;
 	u64 values[4];
 	int n = 0;
 
-	values[n++] = perf_event_read_value(event);
-	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		values[n++] = event->total_time_enabled +
-			atomic64_read(&event->child_total_time_enabled);
-	}
-	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		values[n++] = event->total_time_running +
-			atomic64_read(&event->child_total_time_running);
-	}
+	values[n++] = perf_event_read_value(event, &enabled, &running);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = enabled;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = running;
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(event);
 
-- 
cgit v1.2.3


From ce71b9df8893ec954e56c5979df6da274f20f65e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Nov 2009 05:26:55 +0100
Subject: tracing: Use the perf recursion protection from trace event

When we commit a trace to perf, we first check if we are
recursing in the same buffer so that we don't mess-up the buffer
with a recursing trace. But later on, we do the same check from
perf to avoid commit recursion. The recursion check is desired
early before we touch the buffer but we want to do this check
only once.

Then export the recursion protection from perf and use it from
the trace events before submitting a trace.

v2: Put appropriate Reported-by tag

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
LKML-Reference: <1258864015-10579-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |  9 ++---
 include/linux/perf_event.h         |  4 +++
 include/trace/ftrace.h             | 23 +++++++------
 kernel/perf_event.c                | 68 +++++++++++++++++++++++++-------------
 kernel/trace/trace_event_profile.c | 14 ++++----
 kernel/trace/trace_kprobe.c        | 48 ++++++++++-----------------
 kernel/trace/trace_syscalls.c      | 47 ++++++++++----------------
 7 files changed, 106 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 43360c1d8f70..47bbdf9c38d0 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -137,13 +137,8 @@ struct ftrace_event_call {
 
 #define FTRACE_MAX_PROFILE_SIZE	2048
 
-struct perf_trace_buf {
-	char	buf[FTRACE_MAX_PROFILE_SIZE];
-	int	recursion;
-};
-
-extern struct perf_trace_buf	*perf_trace_buf;
-extern struct perf_trace_buf	*perf_trace_buf_nmi;
+extern char *perf_trace_buf;
+extern char *perf_trace_buf_nmi;
 
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 36fe89f72641..74e98b1d3391 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -874,6 +874,8 @@ extern int perf_output_begin(struct perf_output_handle *handle,
 extern void perf_output_end(struct perf_output_handle *handle);
 extern void perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len);
+extern int perf_swevent_get_recursion_context(int **recursion);
+extern void perf_swevent_put_recursion_context(int *recursion);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -902,6 +904,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
 static inline void perf_event_init(void)				{ }
+static int perf_swevent_get_recursion_context(int **recursion)	{ return -1; }
+static void perf_swevent_put_recursion_context(int *recursion)		{ }
 
 #endif
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 4945d1c99864..c222ef5238bf 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -724,16 +724,19 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 static void ftrace_profile_##call(proto)				\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+	extern int perf_swevent_get_recursion_context(int **recursion); \
+	extern void perf_swevent_put_recursion_context(int *recursion); \
 	struct ftrace_event_call *event_call = &event_##call;		\
 	extern void perf_tp_event(int, u64, u64, void *, int);		\
 	struct ftrace_raw_##call *entry;				\
-	struct perf_trace_buf *trace_buf;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
 	struct trace_entry *ent;					\
 	int __entry_size;						\
 	int __data_size;						\
+	char *trace_buf;						\
 	char *raw_data;							\
+	int *recursion;							\
 	int __cpu;							\
 	int pc;								\
 									\
@@ -749,6 +752,10 @@ static void ftrace_profile_##call(proto)				\
 		return;							\
 									\
 	local_irq_save(irq_flags);					\
+									\
+	if (perf_swevent_get_recursion_context(&recursion))		\
+		goto end_recursion;						\
+									\
 	__cpu = smp_processor_id();					\
 									\
 	if (in_nmi())							\
@@ -759,13 +766,7 @@ static void ftrace_profile_##call(proto)				\
 	if (!trace_buf)							\
 		goto end;						\
 									\
-	trace_buf = per_cpu_ptr(trace_buf, __cpu);			\
-	if (trace_buf->recursion++)					\
-		goto end_recursion;					\
-									\
-	barrier();							\
-									\
-	raw_data = trace_buf->buf;					\
+	raw_data = per_cpu_ptr(trace_buf, __cpu);			\
 									\
 	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;		\
 	entry = (struct ftrace_raw_##call *)raw_data;			\
@@ -780,9 +781,9 @@ static void ftrace_profile_##call(proto)				\
 	perf_tp_event(event_call->id, __addr, __count, entry,		\
 			     __entry_size);				\
 									\
-end_recursion:								\
-	trace_buf->recursion--;						\
-end:									\
+end:								\
+	perf_swevent_put_recursion_context(recursion);			\
+end_recursion:									\
 	local_irq_restore(irq_flags);					\
 									\
 }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 718fa939b1a7..aba822722300 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3880,34 +3880,42 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
 	}
 }
 
-static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
+/*
+ * Must be called with preemption disabled
+ */
+int perf_swevent_get_recursion_context(int **recursion)
 {
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
 	if (in_nmi())
-		return &cpuctx->recursion[3];
+		*recursion = &cpuctx->recursion[3];
+	else if (in_irq())
+		*recursion = &cpuctx->recursion[2];
+	else if (in_softirq())
+		*recursion = &cpuctx->recursion[1];
+	else
+		*recursion = &cpuctx->recursion[0];
 
-	if (in_irq())
-		return &cpuctx->recursion[2];
+	if (**recursion)
+		return -1;
 
-	if (in_softirq())
-		return &cpuctx->recursion[1];
+	(**recursion)++;
 
-	return &cpuctx->recursion[0];
+	return 0;
 }
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-				    u64 nr, int nmi,
-				    struct perf_sample_data *data,
-				    struct pt_regs *regs)
+void perf_swevent_put_recursion_context(int *recursion)
 {
-	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
-	int *recursion = perf_swevent_recursion_context(cpuctx);
-	struct perf_event_context *ctx;
-
-	if (*recursion)
-		goto out;
+	(*recursion)--;
+}
 
-	(*recursion)++;
-	barrier();
+static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
+			       u64 nr, int nmi,
+			       struct perf_sample_data *data,
+			       struct pt_regs *regs)
+{
+	struct perf_event_context *ctx;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
 	rcu_read_lock();
 	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
@@ -3920,12 +3928,25 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 	if (ctx)
 		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
 	rcu_read_unlock();
+}
 
-	barrier();
-	(*recursion)--;
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
+{
+	int *recursion;
+
+	preempt_disable();
+
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto out;
+
+	__do_perf_sw_event(type, event_id, nr, nmi, data, regs);
 
+	perf_swevent_put_recursion_context(recursion);
 out:
-	put_cpu_var(perf_cpu_context);
+	preempt_enable();
 }
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4159,7 +4180,8 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 	if (!regs)
 		regs = task_pt_regs(current);
 
-	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+	/* Trace events already protected against recursion */
+	__do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 				&data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index e0d351b01f5a..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -9,31 +9,33 @@
 #include "trace.h"
 
 
-struct perf_trace_buf *perf_trace_buf;
+char *perf_trace_buf;
 EXPORT_SYMBOL_GPL(perf_trace_buf);
 
-struct perf_trace_buf *perf_trace_buf_nmi;
+char *perf_trace_buf_nmi;
 EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
 
+typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
+
 /* Count the events in use (per event id, not per instance) */
 static int	total_profile_count;
 
 static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 {
-	struct perf_trace_buf *buf;
+	char *buf;
 	int ret = -ENOMEM;
 
 	if (atomic_inc_return(&event->profile_count))
 		return 0;
 
 	if (!total_profile_count) {
-		buf = alloc_percpu(struct perf_trace_buf);
+		buf = (char *)alloc_percpu(perf_trace_t);
 		if (!buf)
 			goto fail_buf;
 
 		rcu_assign_pointer(perf_trace_buf, buf);
 
-		buf = alloc_percpu(struct perf_trace_buf);
+		buf = (char *)alloc_percpu(perf_trace_t);
 		if (!buf)
 			goto fail_buf_nmi;
 
@@ -79,7 +81,7 @@ int ftrace_profile_enable(int event_id)
 
 static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 {
-	struct perf_trace_buf *buf, *nmi_buf;
+	char *buf, *nmi_buf;
 
 	if (!atomic_add_negative(-1, &event->profile_count))
 		return;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3696476f307d..22e6f68b05b3 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1208,11 +1208,12 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry *entry;
-	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
+	char *trace_buf;
 	char *raw_data;
+	int *recursion;
 
 	pc = preempt_count();
 	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1227,6 +1228,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	 * This also protects the rcu read side
 	 */
 	local_irq_save(irq_flags);
+
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto end_recursion;
+
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -1237,18 +1242,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, __cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, __cpu);
 
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1263,9 +1257,9 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ip, 1, entry, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(recursion);
+end_recursion:
 	local_irq_restore(irq_flags);
 
 	return 0;
@@ -1278,10 +1272,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry *entry;
-	struct perf_trace_buf *trace_buf;
 	struct trace_entry *ent;
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
+	char *trace_buf;
+	int *recursion;
 	char *raw_data;
 
 	pc = preempt_count();
@@ -1297,6 +1292,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	 * This also protects the rcu read side
 	 */
 	local_irq_save(irq_flags);
+
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto end_recursion;
+
 	__cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -1307,18 +1306,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, __cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, __cpu);
 
 	/* Zero dead bytes from alignment to avoid buffer leak to userspace */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -1334,9 +1322,9 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 	perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(recursion);
+end_recursion:
 	local_irq_restore(irq_flags);
 
 	return 0;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 51213b0aa81b..0bb934875263 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -477,10 +477,11 @@ static int sys_prof_refcount_exit;
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
-	struct perf_trace_buf *trace_buf;
 	struct syscall_trace_enter *rec;
 	unsigned long flags;
+	char *trace_buf;
 	char *raw_data;
+	int *recursion;
 	int syscall_nr;
 	int size;
 	int cpu;
@@ -505,6 +506,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
 
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto end_recursion;
+
 	cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -515,18 +519,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, cpu);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -539,9 +532,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 			       (unsigned long *)&rec->args);
 	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(recursion);
+end_recursion:
 	local_irq_restore(flags);
 }
 
@@ -588,10 +581,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
-	struct perf_trace_buf *trace_buf;
 	unsigned long flags;
 	int syscall_nr;
+	char *trace_buf;
 	char *raw_data;
+	int *recursion;
 	int size;
 	int cpu;
 
@@ -617,6 +611,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
+
+	if (perf_swevent_get_recursion_context(&recursion))
+		goto end_recursion;
+
 	cpu = smp_processor_id();
 
 	if (in_nmi())
@@ -627,18 +625,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	if (!trace_buf)
 		goto end;
 
-	trace_buf = per_cpu_ptr(trace_buf, cpu);
-
-	if (trace_buf->recursion++)
-		goto end_recursion;
-
-	/*
-	 * Make recursion update visible before entering perf_tp_event
-	 * so that we protect from perf recursions.
-	 */
-	barrier();
-
-	raw_data = trace_buf->buf;
+	raw_data = per_cpu_ptr(trace_buf, cpu);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -652,9 +639,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 
 	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
 
-end_recursion:
-	trace_buf->recursion--;
 end:
+	perf_swevent_put_recursion_context(recursion);
+end_recursion:
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3


From 5093ebad5f2348076fdc3dac7d2358b1ad7f85f7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Nov 2009 05:21:35 +0100
Subject: hw-breakpoints: Separate the kernel part from breakpoint headers

So that we can include this header from userspace tools, like
perf tools, to get the breakpoint types and len definitions.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1258863695-10464-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hw_breakpoint.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 0b98cbf76da7..4659e0c55ea6 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -16,6 +16,7 @@ enum {
 	HW_BREAKPOINT_X = 4,
 };
 
+#ifdef __KERNEL__
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
@@ -133,5 +134,6 @@ static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
 }
 
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#endif /* __KERNEL__ */
 
 #endif /* _LINUX_HW_BREAKPOINT_H */
-- 
cgit v1.2.3


From 9f680ab41485edfdc96331b70afa7513aa0a7720 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 22 Nov 2009 08:53:49 -0800
Subject: rcu: Eliminate unneeded function wrapping

The functions rcu_init() is a wrapper for __rcu_init(), and also
sets up the CPU-hotplug notifier for rcu_barrier_cpu_hotplug().
But TINY_RCU doesn't need CPU-hotplug notification, and the
rcu_barrier_cpu_hotplug() is a simple wrapper for
rcu_cpu_notify().

So push rcu_init() out to kernel/rcutree.c and kernel/rcutiny.c
and get rid of the wrapper function rcu_barrier_cpu_hotplug().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12589088302320-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcutiny.h |  2 --
 include/linux/rcutree.h |  3 ---
 kernel/rcupdate.c       | 22 ----------------------
 kernel/rcutiny.c        | 11 +----------
 kernel/rcutree.c        | 17 ++++++++++++++---
 5 files changed, 15 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 2c1fe8373e71..a3b6272af2dd 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -38,7 +38,6 @@ void rcu_bh_qs(int cpu);
 
 #define rcu_init_sched()	do { } while (0)
 extern void rcu_check_callbacks(int cpu, int user);
-extern void __rcu_init(void);
 
 /*
  * Return the number of grace periods.
@@ -69,7 +68,6 @@ static inline void synchronize_rcu_bh_expedited(void)
 }
 
 struct notifier_block;
-extern int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu);
 
 #ifdef CONFIG_NO_HZ
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 9642c6bcb399..111a65257350 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -34,8 +34,6 @@ struct notifier_block;
 
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
-extern int rcu_cpu_notify(struct notifier_block *self,
-			  unsigned long action, void *hcpu);
 extern int rcu_needs_cpu(int cpu);
 extern int rcu_expedited_torture_stats(char *page);
 
@@ -83,7 +81,6 @@ static inline void synchronize_rcu_bh_expedited(void)
 	synchronize_sched_expedited();
 }
 
-extern void __rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
 
 extern long rcu_batches_completed(void);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 7625f207f65e..eb6b534db318 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -161,28 +161,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
 #endif /* #ifndef CONFIG_TINY_RCU */
 
-static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
-		unsigned long action, void *hcpu)
-{
-	return rcu_cpu_notify(self, action, hcpu);
-}
-
-void __init rcu_init(void)
-{
-	int i;
-
-	__rcu_init();
-	cpu_notifier(rcu_barrier_cpu_hotplug, 0);
-
-	/*
-	 * We don't need protection against CPU-hotplug here because
-	 * this is called early in boot, before either interrupts
-	 * or the scheduler are operational.
-	 */
-	for_each_online_cpu(i)
-		rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
-}
-
 void rcu_scheduler_starting(void)
 {
 	WARN_ON(num_online_cpus() != 1);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index b33ec3aa377a..9f6d9ff2572c 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -177,15 +177,6 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 	__rcu_process_callbacks(&rcu_bh_ctrlblk);
 }
 
-/*
- * Null function to handle CPU being onlined.  Longer term, we want to
- * make TINY_RCU avoid using rcupdate.c, but later...
- */
-int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-	return NOTIFY_OK;
-}
-
 /*
  * Wait for a grace period to elapse.  But it is illegal to invoke
  * synchronize_sched() from within an RCU read-side critical section.
@@ -285,7 +276,7 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
-void __rcu_init(void)
+void __init rcu_init(void)
 {
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b79bfcd28e95..e3d3bbddbcd5 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1644,8 +1644,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
 /*
  * Handle CPU online/offline notification events.
  */
-int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-			     unsigned long action, void *hcpu)
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				    unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 
@@ -1781,8 +1781,10 @@ do { \
 	} \
 } while (0)
 
-void __init __rcu_init(void)
+void __init rcu_init(void)
 {
+	int i;
+
 	rcu_bootup_announce();
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
@@ -1791,6 +1793,15 @@ void __init __rcu_init(void)
 	RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
 	__rcu_init_preempt();
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+
+	/*
+	 * We don't need protection against CPU-hotplug here because
+	 * this is called early in boot, before either interrupts
+	 * or the scheduler are operational.
+	 */
+	cpu_notifier(rcu_cpu_notify, 0);
+	for_each_online_cpu(i)
+		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);
 }
 
 #include "rcutree_plugin.h"
-- 
cgit v1.2.3


From 6ebb237bece23275d1da149b61a342f0d4d06a08 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 22 Nov 2009 08:53:50 -0800
Subject: rcu: Re-arrange code to reduce #ifdef pain

Remove #ifdefs from kernel/rcupdate.c and
include/linux/rcupdate.h by moving code to
include/linux/rcutiny.h, include/linux/rcutree.h, and
kernel/rcutree.c.

Also remove some definitions that are no longer used.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1258908830885-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcupdate.h |  12 ------
 include/linux/rcutiny.h  |  11 +++++
 include/linux/rcutree.h  |   4 +-
 kernel/rcupdate.c        | 104 -----------------------------------------------
 kernel/rcutree.c         |  80 ++++++++++++++++++++++++++++++++++++
 kernel/rcutree_plugin.h  |  24 +++++++++++
 6 files changed, 118 insertions(+), 117 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 2f1bc42a3b82..24440f4bf476 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,11 +52,6 @@ struct rcu_head {
 };
 
 /* Exported common interfaces */
-#ifdef CONFIG_TREE_PREEMPT_RCU
-extern void synchronize_rcu(void);
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-#define synchronize_rcu synchronize_sched
-#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 extern void synchronize_rcu_bh(void);
 extern void synchronize_sched(void);
 extern void rcu_barrier(void);
@@ -67,13 +62,6 @@ extern int sched_expedited_torture_stats(char *page);
 
 /* Internal to kernel */
 extern void rcu_init(void);
-extern void rcu_scheduler_starting(void);
-#ifndef CONFIG_TINY_RCU
-extern int rcu_needs_cpu(int cpu);
-#else
-static inline int rcu_needs_cpu(int cpu) { return 0; }
-#endif
-extern int rcu_scheduler_active;
 
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index a3b6272af2dd..c4ba9a78721e 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -39,6 +39,11 @@ void rcu_bh_qs(int cpu);
 #define rcu_init_sched()	do { } while (0)
 extern void rcu_check_callbacks(int cpu, int user);
 
+static inline int rcu_needs_cpu(int cpu)
+{
+	return 0;
+}
+
 /*
  * Return the number of grace periods.
  */
@@ -57,6 +62,8 @@ static inline long rcu_batches_completed_bh(void)
 
 extern int rcu_expedited_torture_stats(char *page);
 
+#define synchronize_rcu synchronize_sched
+
 static inline void synchronize_rcu_expedited(void)
 {
 	synchronize_sched();
@@ -86,6 +93,10 @@ static inline void rcu_exit_nohz(void)
 
 #endif /* #else #ifdef CONFIG_NO_HZ */
 
+static inline void rcu_scheduler_starting(void)
+{
+}
+
 static inline void exit_rcu(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 111a65257350..c93eee5911b0 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,12 +35,14 @@ struct notifier_block;
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern int rcu_needs_cpu(int cpu);
+extern void rcu_scheduler_starting(void);
 extern int rcu_expedited_torture_stats(char *page);
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
 extern void __rcu_read_lock(void);
 extern void __rcu_read_unlock(void);
+extern void synchronize_rcu(void);
 extern void exit_rcu(void);
 
 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
@@ -55,7 +57,7 @@ static inline void __rcu_read_unlock(void)
 	preempt_enable();
 }
 
-#define __synchronize_sched() synchronize_rcu()
+#define synchronize_rcu synchronize_sched
 
 static inline void exit_rcu(void)
 {
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index eb6b534db318..9b7fd4723878 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
-#include <linux/kernel_stat.h>
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
@@ -53,8 +52,6 @@ struct lockdep_map rcu_lock_map =
 EXPORT_SYMBOL_GPL(rcu_lock_map);
 #endif
 
-int rcu_scheduler_active __read_mostly;
-
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
@@ -66,104 +63,3 @@ void wakeme_after_rcu(struct rcu_head  *head)
 	rcu = container_of(head, struct rcu_synchronize, head);
 	complete(&rcu->completion);
 }
-
-#ifndef CONFIG_TINY_RCU
-
-#ifdef CONFIG_TREE_PREEMPT_RCU
-
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-void synchronize_rcu(void)
-{
-	struct rcu_synchronize rcu;
-
-	if (!rcu_scheduler_active)
-		return;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-
-/**
- * synchronize_sched - wait until an rcu-sched grace period has elapsed.
- *
- * Control will return to the caller some time after a full rcu-sched
- * grace period has elapsed, in other words after all currently executing
- * rcu-sched read-side critical sections have completed.   These read-side
- * critical sections are delimited by rcu_read_lock_sched() and
- * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(),
- * local_irq_disable(), and so on may be used in place of
- * rcu_read_lock_sched().
- *
- * This means that all preempt_disable code sequences, including NMI and
- * hardware-interrupt handlers, in progress on entry will have completed
- * before this primitive returns.  However, this does not guarantee that
- * softirq handlers will have completed, since in some kernels, these
- * handlers can run in process context, and can block.
- *
- * This primitive provides the guarantees made by the (now removed)
- * synchronize_kernel() API.  In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
- */
-void synchronize_sched(void)
-{
-	struct rcu_synchronize rcu;
-
-	if (rcu_blocking_is_gp())
-		return;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_sched(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-}
-EXPORT_SYMBOL_GPL(synchronize_sched);
-
-/**
- * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
- *
- * Control will return to the caller some time after a full rcu_bh grace
- * period has elapsed, in other words after all currently executing rcu_bh
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
- * and may be nested.
- */
-void synchronize_rcu_bh(void)
-{
-	struct rcu_synchronize rcu;
-
-	if (rcu_blocking_is_gp())
-		return;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_bh(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
-
-#endif /* #ifndef CONFIG_TINY_RCU */
-
-void rcu_scheduler_starting(void)
-{
-	WARN_ON(num_online_cpus() != 1);
-	WARN_ON(nr_context_switches() > 0);
-	rcu_scheduler_active = 1;
-}
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e3d3bbddbcd5..4ca7e0292fd8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/time.h>
+#include <linux/kernel_stat.h>
 
 #include "rcutree.h"
 
@@ -79,6 +80,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
+static int rcu_scheduler_active __read_mostly;
+
 
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
@@ -1396,6 +1399,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
 
+/**
+ * synchronize_sched - wait until an rcu-sched grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-sched
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-sched read-side critical sections have completed.   These read-side
+ * critical sections are delimited by rcu_read_lock_sched() and
+ * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(),
+ * local_irq_disable(), and so on may be used in place of
+ * rcu_read_lock_sched().
+ *
+ * This means that all preempt_disable code sequences, including NMI and
+ * hardware-interrupt handlers, in progress on entry will have completed
+ * before this primitive returns.  However, this does not guarantee that
+ * softirq handlers will have completed, since in some kernels, these
+ * handlers can run in process context, and can block.
+ *
+ * This primitive provides the guarantees made by the (now removed)
+ * synchronize_kernel() API.  In contrast, synchronize_rcu() only
+ * guarantees that rcu_read_lock() sections will have completed.
+ * In "classic RCU", these two guarantees happen to be one and
+ * the same, but can differ in realtime RCU implementations.
+ */
+void synchronize_sched(void)
+{
+	struct rcu_synchronize rcu;
+
+	if (rcu_blocking_is_gp())
+		return;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_sched(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+
+/**
+ * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu_bh grace
+ * period has elapsed, in other words after all currently executing rcu_bh
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
+ * and may be nested.
+ */
+void synchronize_rcu_bh(void)
+{
+	struct rcu_synchronize rcu;
+
+	if (rcu_blocking_is_gp())
+		return;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu_bh(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+
 /*
  * Check to see if there is any immediate RCU-related work to be done
  * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1480,6 +1545,21 @@ int rcu_needs_cpu(int cpu)
 	       rcu_preempt_needs_cpu(cpu);
 }
 
+/*
+ * This function is invoked towards the end of the scheduler's initialization
+ * process.  Before this is called, the idle task might contain
+ * RCU read-side critical sections (during which time, this idle
+ * task is booting the system).  After this function is called, the
+ * idle tasks are prohibited from containing RCU read-side critical
+ * sections.
+ */
+void rcu_scheduler_starting(void)
+{
+	WARN_ON(num_online_cpus() != 1);
+	WARN_ON(nr_context_switches() > 0);
+	rcu_scheduler_active = 1;
+}
+
 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0bdb592eee66..1d295c789d3d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -425,6 +425,30 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void)
+{
+	struct rcu_synchronize rcu;
+
+	if (!rcu_scheduler_active)
+		return;
+
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	call_rcu(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
 /*
  * Wait for an rcu-preempt grace period.  We are supposed to expedite the
  * grace period, but this is the crude slow compatability hack, so just
-- 
cgit v1.2.3


From 3066eec68d21cf4d468809c0b7b1fe9ee59c8f32 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Date: Thu, 22 Oct 2009 13:26:45 +0300
Subject: MFD: twl4030: add twl4030_codec MFD as a new child to the core

New MFD child to twl4030 MFD device.

Reason for the twl4030_codec MFD: the vibra control is actually in the codec
part of the twl4030. If both the vibra and the audio functionality is needed
from the twl4030 at the same time, than they need to control the codec power
and APLL at the same time without breaking the other driver.
Also these two has to be able to work without the need for the other driver.

This MFD device will be used by the drivers, which needs resources
from the twl4030 codec like audio and vibra.

The platform specific configuration data is passed along to the
child drivers (audio, vibra).

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Acked-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 drivers/mfd/Kconfig               |   6 +
 drivers/mfd/Makefile              |   1 +
 drivers/mfd/twl4030-codec.c       | 241 +++++++++++++++++++++++++++++++++
 drivers/mfd/twl4030-core.c        |  14 ++
 include/linux/i2c/twl4030.h       |  18 +++
 include/linux/mfd/twl4030-codec.h | 271 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 551 insertions(+)
 create mode 100644 drivers/mfd/twl4030-codec.c
 create mode 100644 include/linux/mfd/twl4030-codec.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 570be139f9df..08f2d07bf56a 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -121,6 +121,12 @@ config TWL4030_POWER
 	  and load scripts controling which resources are switched off/on
 	  or reset when a sleep, wakeup or warm reset event occurs.
 
+config TWL4030_CODEC
+	bool
+	depends on TWL4030_CORE
+	select MFD_CORE
+	default n
+
 config MFD_TMIO
 	bool
 	default n
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index f3b277b90d40..af0fc903cec8 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_MENELAUS)		+= menelaus.o
 
 obj-$(CONFIG_TWL4030_CORE)	+= twl4030-core.o twl4030-irq.o
 obj-$(CONFIG_TWL4030_POWER)    += twl4030-power.o
+obj-$(CONFIG_TWL4030_CODEC)	+= twl4030-codec.o
 
 obj-$(CONFIG_MFD_MC13783)	+= mc13783-core.o
 
diff --git a/drivers/mfd/twl4030-codec.c b/drivers/mfd/twl4030-codec.c
new file mode 100644
index 000000000000..97103078dc2a
--- /dev/null
+++ b/drivers/mfd/twl4030-codec.c
@@ -0,0 +1,241 @@
+/*
+ * MFD driver for twl4030 codec submodule
+ *
+ * Author:	Peter Ujfalusi <peter.ujfalusi@nokia.com>
+ *
+ * Copyright:   (C) 2009 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/platform_device.h>
+#include <linux/i2c/twl4030.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/twl4030-codec.h>
+
+#define TWL4030_CODEC_CELLS	2
+
+static struct platform_device *twl4030_codec_dev;
+
+struct twl4030_codec_resource {
+	int request_count;
+	u8 reg;
+	u8 mask;
+};
+
+struct twl4030_codec {
+	struct mutex mutex;
+	struct twl4030_codec_resource resource[TWL4030_CODEC_RES_MAX];
+	struct mfd_cell cells[TWL4030_CODEC_CELLS];
+};
+
+/*
+ * Modify the resource, the function returns the content of the register
+ * after the modification.
+ */
+static int twl4030_codec_set_resource(enum twl4030_codec_res id, int enable)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	u8 val;
+
+	twl4030_i2c_read_u8(TWL4030_MODULE_AUDIO_VOICE, &val,
+			codec->resource[id].reg);
+
+	if (enable)
+		val |= codec->resource[id].mask;
+	else
+		val &= ~codec->resource[id].mask;
+
+	twl4030_i2c_write_u8(TWL4030_MODULE_AUDIO_VOICE,
+					val, codec->resource[id].reg);
+
+	return val;
+}
+
+static inline int twl4030_codec_get_resource(enum twl4030_codec_res id)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	u8 val;
+
+	twl4030_i2c_read_u8(TWL4030_MODULE_AUDIO_VOICE, &val,
+			codec->resource[id].reg);
+
+	return val;
+}
+
+/*
+ * Enable the resource.
+ * The function returns with error or the content of the register
+ */
+int twl4030_codec_enable_resource(enum twl4030_codec_res id)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	int val;
+
+	if (id >= TWL4030_CODEC_RES_MAX) {
+		dev_err(&twl4030_codec_dev->dev,
+				"Invalid resource ID (%u)\n", id);
+		return -EINVAL;
+	}
+
+	mutex_lock(&codec->mutex);
+	if (!codec->resource[id].request_count)
+		/* Resource was disabled, enable it */
+		val = twl4030_codec_set_resource(id, 1);
+	else
+		val = twl4030_codec_get_resource(id);
+
+	codec->resource[id].request_count++;
+	mutex_unlock(&codec->mutex);
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(twl4030_codec_enable_resource);
+
+/*
+ * Disable the resource.
+ * The function returns with error or the content of the register
+ */
+int twl4030_codec_disable_resource(unsigned id)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+	int val;
+
+	if (id >= TWL4030_CODEC_RES_MAX) {
+		dev_err(&twl4030_codec_dev->dev,
+				"Invalid resource ID (%u)\n", id);
+		return -EINVAL;
+	}
+
+	mutex_lock(&codec->mutex);
+	if (!codec->resource[id].request_count) {
+		dev_err(&twl4030_codec_dev->dev,
+			"Resource has been disabled already (%u)\n", id);
+		mutex_unlock(&codec->mutex);
+		return -EPERM;
+	}
+	codec->resource[id].request_count--;
+
+	if (!codec->resource[id].request_count)
+		/* Resource can be disabled now */
+		val = twl4030_codec_set_resource(id, 0);
+	else
+		val = twl4030_codec_get_resource(id);
+
+	mutex_unlock(&codec->mutex);
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(twl4030_codec_disable_resource);
+
+static int __devinit twl4030_codec_probe(struct platform_device *pdev)
+{
+	struct twl4030_codec *codec;
+	struct twl4030_codec_data *pdata = pdev->dev.platform_data;
+	struct mfd_cell *cell = NULL;
+	int ret, childs = 0;
+
+	codec = kzalloc(sizeof(struct twl4030_codec), GFP_KERNEL);
+	if (!codec)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, codec);
+
+	twl4030_codec_dev = pdev;
+	mutex_init(&codec->mutex);
+
+	/* Codec power */
+	codec->resource[TWL4030_CODEC_RES_POWER].reg = TWL4030_REG_CODEC_MODE;
+	codec->resource[TWL4030_CODEC_RES_POWER].mask = TWL4030_CODECPDZ;
+
+	/* PLL */
+	codec->resource[TWL4030_CODEC_RES_APLL].reg = TWL4030_REG_APLL_CTL;
+	codec->resource[TWL4030_CODEC_RES_APLL].mask = TWL4030_APLL_EN;
+
+	if (pdata->audio) {
+		cell = &codec->cells[childs];
+		cell->name = "twl4030_codec_audio";
+		cell->platform_data = pdata->audio;
+		cell->data_size = sizeof(*pdata->audio);
+		childs++;
+	}
+	if (pdata->vibra) {
+		cell = &codec->cells[childs];
+		cell->name = "twl4030_codec_vibra";
+		cell->platform_data = pdata->vibra;
+		cell->data_size = sizeof(*pdata->vibra);
+		childs++;
+	}
+
+	if (childs)
+		ret = mfd_add_devices(&pdev->dev, pdev->id, codec->cells,
+				      childs, NULL, 0);
+	else {
+		dev_err(&pdev->dev, "No platform data found for childs\n");
+		ret = -ENODEV;
+	}
+
+	if (!ret)
+		return 0;
+
+	platform_set_drvdata(pdev, NULL);
+	kfree(codec);
+	twl4030_codec_dev = NULL;
+	return ret;
+}
+
+static int __devexit twl4030_codec_remove(struct platform_device *pdev)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(pdev);
+
+	mfd_remove_devices(&pdev->dev);
+	platform_set_drvdata(pdev, NULL);
+	kfree(codec);
+	twl4030_codec_dev = NULL;
+
+	return 0;
+}
+
+MODULE_ALIAS("platform:twl4030_codec");
+
+static struct platform_driver twl4030_codec_driver = {
+	.probe		= twl4030_codec_probe,
+	.remove		= __devexit_p(twl4030_codec_remove),
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= "twl4030_codec",
+	},
+};
+
+static int __devinit twl4030_codec_init(void)
+{
+	return platform_driver_register(&twl4030_codec_driver);
+}
+module_init(twl4030_codec_init);
+
+static void __devexit twl4030_codec_exit(void)
+{
+	platform_driver_unregister(&twl4030_codec_driver);
+}
+module_exit(twl4030_codec_exit);
+
+MODULE_AUTHOR("Peter Ujfalusi <peter.ujfalusi@nokia.com>");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/mfd/twl4030-core.c b/drivers/mfd/twl4030-core.c
index 6cb12502e306..cf462b90dee2 100644
--- a/drivers/mfd/twl4030-core.c
+++ b/drivers/mfd/twl4030-core.c
@@ -114,6 +114,12 @@
 #define twl_has_watchdog()        false
 #endif
 
+#if defined(CONFIG_TWL4030_CODEC) || defined(CONFIG_TWL4030_CODEC_MODULE)
+#define twl_has_codec()	true
+#else
+#define twl_has_codec()	false
+#endif
+
 /* Triton Core internal information (BEGIN) */
 
 /* Last - for index max*/
@@ -601,6 +607,14 @@ add_children(struct twl4030_platform_data *pdata, unsigned long features)
 			return PTR_ERR(child);
 	}
 
+	if (twl_has_codec() && pdata->codec) {
+		child = add_child(1, "twl4030_codec",
+				pdata->codec, sizeof(*pdata->codec),
+				false, 0, 0);
+		if (IS_ERR(child))
+			return PTR_ERR(child);
+	}
+
 	if (twl_has_regulator()) {
 		/*
 		child = add_regulator(TWL4030_REG_VPLL1, pdata->vpll1);
diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h
index 508824ee35e6..ba61add3e05a 100644
--- a/include/linux/i2c/twl4030.h
+++ b/include/linux/i2c/twl4030.h
@@ -401,6 +401,23 @@ struct twl4030_power_data {
 
 extern void twl4030_power_init(struct twl4030_power_data *triton2_scripts);
 
+struct twl4030_codec_audio_data {
+	unsigned int	audio_mclk;
+	unsigned int ramp_delay_value;
+	unsigned int hs_extmute:1;
+	void (*set_hs_extmute)(int mute);
+};
+
+struct twl4030_codec_vibra_data {
+	unsigned int	audio_mclk;
+	unsigned int	coexist;
+};
+
+struct twl4030_codec_data {
+	struct twl4030_codec_audio_data		*audio;
+	struct twl4030_codec_vibra_data		*vibra;
+};
+
 struct twl4030_platform_data {
 	unsigned				irq_base, irq_end;
 	struct twl4030_bci_platform_data	*bci;
@@ -409,6 +426,7 @@ struct twl4030_platform_data {
 	struct twl4030_keypad_data		*keypad;
 	struct twl4030_usb_data			*usb;
 	struct twl4030_power_data		*power;
+	struct twl4030_codec_data		*codec;
 
 	/* LDO regulators */
 	struct regulator_init_data		*vdac;
diff --git a/include/linux/mfd/twl4030-codec.h b/include/linux/mfd/twl4030-codec.h
new file mode 100644
index 000000000000..ef0a3041d759
--- /dev/null
+++ b/include/linux/mfd/twl4030-codec.h
@@ -0,0 +1,271 @@
+/*
+ * MFD driver for twl4030 codec submodule
+ *
+ * Author:	Peter Ujfalusi <peter.ujfalusi@nokia.com>
+ *
+ * Copyright:   (C) 2009 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ */
+
+#ifndef __TWL4030_CODEC_H__
+#define __TWL4030_CODEC_H__
+
+/* Codec registers */
+#define TWL4030_REG_CODEC_MODE		0x01
+#define TWL4030_REG_OPTION		0x02
+#define TWL4030_REG_UNKNOWN		0x03
+#define TWL4030_REG_MICBIAS_CTL		0x04
+#define TWL4030_REG_ANAMICL		0x05
+#define TWL4030_REG_ANAMICR		0x06
+#define TWL4030_REG_AVADC_CTL		0x07
+#define TWL4030_REG_ADCMICSEL		0x08
+#define TWL4030_REG_DIGMIXING		0x09
+#define TWL4030_REG_ATXL1PGA		0x0A
+#define TWL4030_REG_ATXR1PGA		0x0B
+#define TWL4030_REG_AVTXL2PGA		0x0C
+#define TWL4030_REG_AVTXR2PGA		0x0D
+#define TWL4030_REG_AUDIO_IF		0x0E
+#define TWL4030_REG_VOICE_IF		0x0F
+#define TWL4030_REG_ARXR1PGA		0x10
+#define TWL4030_REG_ARXL1PGA		0x11
+#define TWL4030_REG_ARXR2PGA		0x12
+#define TWL4030_REG_ARXL2PGA		0x13
+#define TWL4030_REG_VRXPGA		0x14
+#define TWL4030_REG_VSTPGA		0x15
+#define TWL4030_REG_VRX2ARXPGA		0x16
+#define TWL4030_REG_AVDAC_CTL		0x17
+#define TWL4030_REG_ARX2VTXPGA		0x18
+#define TWL4030_REG_ARXL1_APGA_CTL	0x19
+#define TWL4030_REG_ARXR1_APGA_CTL	0x1A
+#define TWL4030_REG_ARXL2_APGA_CTL	0x1B
+#define TWL4030_REG_ARXR2_APGA_CTL	0x1C
+#define TWL4030_REG_ATX2ARXPGA		0x1D
+#define TWL4030_REG_BT_IF		0x1E
+#define TWL4030_REG_BTPGA		0x1F
+#define TWL4030_REG_BTSTPGA		0x20
+#define TWL4030_REG_EAR_CTL		0x21
+#define TWL4030_REG_HS_SEL		0x22
+#define TWL4030_REG_HS_GAIN_SET		0x23
+#define TWL4030_REG_HS_POPN_SET		0x24
+#define TWL4030_REG_PREDL_CTL		0x25
+#define TWL4030_REG_PREDR_CTL		0x26
+#define TWL4030_REG_PRECKL_CTL		0x27
+#define TWL4030_REG_PRECKR_CTL		0x28
+#define TWL4030_REG_HFL_CTL		0x29
+#define TWL4030_REG_HFR_CTL		0x2A
+#define TWL4030_REG_ALC_CTL		0x2B
+#define TWL4030_REG_ALC_SET1		0x2C
+#define TWL4030_REG_ALC_SET2		0x2D
+#define TWL4030_REG_BOOST_CTL		0x2E
+#define TWL4030_REG_SOFTVOL_CTL		0x2F
+#define TWL4030_REG_DTMF_FREQSEL	0x30
+#define TWL4030_REG_DTMF_TONEXT1H	0x31
+#define TWL4030_REG_DTMF_TONEXT1L	0x32
+#define TWL4030_REG_DTMF_TONEXT2H	0x33
+#define TWL4030_REG_DTMF_TONEXT2L	0x34
+#define TWL4030_REG_DTMF_TONOFF		0x35
+#define TWL4030_REG_DTMF_WANONOFF	0x36
+#define TWL4030_REG_I2S_RX_SCRAMBLE_H	0x37
+#define TWL4030_REG_I2S_RX_SCRAMBLE_M	0x38
+#define TWL4030_REG_I2S_RX_SCRAMBLE_L	0x39
+#define TWL4030_REG_APLL_CTL		0x3A
+#define TWL4030_REG_DTMF_CTL		0x3B
+#define TWL4030_REG_DTMF_PGA_CTL2	0x3C
+#define TWL4030_REG_DTMF_PGA_CTL1	0x3D
+#define TWL4030_REG_MISC_SET_1		0x3E
+#define TWL4030_REG_PCMBTMUX		0x3F
+#define TWL4030_REG_RX_PATH_SEL		0x43
+#define TWL4030_REG_VDL_APGA_CTL	0x44
+#define TWL4030_REG_VIBRA_CTL		0x45
+#define TWL4030_REG_VIBRA_SET		0x46
+#define TWL4030_REG_VIBRA_PWM_SET	0x47
+#define TWL4030_REG_ANAMIC_GAIN		0x48
+#define TWL4030_REG_MISC_SET_2		0x49
+
+/* Bitfield Definitions */
+
+/* TWL4030_CODEC_MODE (0x01) Fields */
+#define TWL4030_APLL_RATE		0xF0
+#define TWL4030_APLL_RATE_8000		0x00
+#define TWL4030_APLL_RATE_11025		0x10
+#define TWL4030_APLL_RATE_12000		0x20
+#define TWL4030_APLL_RATE_16000		0x40
+#define TWL4030_APLL_RATE_22050		0x50
+#define TWL4030_APLL_RATE_24000		0x60
+#define TWL4030_APLL_RATE_32000		0x80
+#define TWL4030_APLL_RATE_44100		0x90
+#define TWL4030_APLL_RATE_48000		0xA0
+#define TWL4030_APLL_RATE_96000		0xE0
+#define TWL4030_SEL_16K			0x08
+#define TWL4030_CODECPDZ		0x02
+#define TWL4030_OPT_MODE		0x01
+#define TWL4030_OPTION_1		(1 << 0)
+#define TWL4030_OPTION_2		(0 << 0)
+
+/* TWL4030_OPTION (0x02) Fields */
+#define TWL4030_ATXL1_EN		(1 << 0)
+#define TWL4030_ATXR1_EN		(1 << 1)
+#define TWL4030_ATXL2_VTXL_EN		(1 << 2)
+#define TWL4030_ATXR2_VTXR_EN		(1 << 3)
+#define TWL4030_ARXL1_VRX_EN		(1 << 4)
+#define TWL4030_ARXR1_EN		(1 << 5)
+#define TWL4030_ARXL2_EN		(1 << 6)
+#define TWL4030_ARXR2_EN		(1 << 7)
+
+/* TWL4030_REG_MICBIAS_CTL (0x04) Fields */
+#define TWL4030_MICBIAS2_CTL		0x40
+#define TWL4030_MICBIAS1_CTL		0x20
+#define TWL4030_HSMICBIAS_EN		0x04
+#define TWL4030_MICBIAS2_EN		0x02
+#define TWL4030_MICBIAS1_EN		0x01
+
+/* ANAMICL (0x05) Fields */
+#define TWL4030_CNCL_OFFSET_START	0x80
+#define TWL4030_OFFSET_CNCL_SEL		0x60
+#define TWL4030_OFFSET_CNCL_SEL_ARX1	0x00
+#define TWL4030_OFFSET_CNCL_SEL_ARX2	0x20
+#define TWL4030_OFFSET_CNCL_SEL_VRX	0x40
+#define TWL4030_OFFSET_CNCL_SEL_ALL	0x60
+#define TWL4030_MICAMPL_EN		0x10
+#define TWL4030_CKMIC_EN		0x08
+#define TWL4030_AUXL_EN			0x04
+#define TWL4030_HSMIC_EN		0x02
+#define TWL4030_MAINMIC_EN		0x01
+
+/* ANAMICR (0x06) Fields */
+#define TWL4030_MICAMPR_EN		0x10
+#define TWL4030_AUXR_EN			0x04
+#define TWL4030_SUBMIC_EN		0x01
+
+/* AVADC_CTL (0x07) Fields */
+#define TWL4030_ADCL_EN			0x08
+#define TWL4030_AVADC_CLK_PRIORITY	0x04
+#define TWL4030_ADCR_EN			0x02
+
+/* TWL4030_REG_ADCMICSEL (0x08) Fields */
+#define TWL4030_DIGMIC1_EN		0x08
+#define TWL4030_TX2IN_SEL		0x04
+#define TWL4030_DIGMIC0_EN		0x02
+#define TWL4030_TX1IN_SEL		0x01
+
+/* AUDIO_IF (0x0E) Fields */
+#define TWL4030_AIF_SLAVE_EN		0x80
+#define TWL4030_DATA_WIDTH		0x60
+#define TWL4030_DATA_WIDTH_16S_16W	0x00
+#define TWL4030_DATA_WIDTH_32S_16W	0x40
+#define TWL4030_DATA_WIDTH_32S_24W	0x60
+#define TWL4030_AIF_FORMAT		0x18
+#define TWL4030_AIF_FORMAT_CODEC	0x00
+#define TWL4030_AIF_FORMAT_LEFT		0x08
+#define TWL4030_AIF_FORMAT_RIGHT	0x10
+#define TWL4030_AIF_FORMAT_TDM		0x18
+#define TWL4030_AIF_TRI_EN		0x04
+#define TWL4030_CLK256FS_EN		0x02
+#define TWL4030_AIF_EN			0x01
+
+/* VOICE_IF (0x0F) Fields */
+#define TWL4030_VIF_SLAVE_EN		0x80
+#define TWL4030_VIF_DIN_EN		0x40
+#define TWL4030_VIF_DOUT_EN		0x20
+#define TWL4030_VIF_SWAP		0x10
+#define TWL4030_VIF_FORMAT		0x08
+#define TWL4030_VIF_TRI_EN		0x04
+#define TWL4030_VIF_SUB_EN		0x02
+#define TWL4030_VIF_EN			0x01
+
+/* EAR_CTL (0x21) */
+#define TWL4030_EAR_GAIN		0x30
+
+/* HS_GAIN_SET (0x23) Fields */
+#define TWL4030_HSR_GAIN		0x0C
+#define TWL4030_HSR_GAIN_PWR_DOWN	0x00
+#define TWL4030_HSR_GAIN_PLUS_6DB	0x04
+#define TWL4030_HSR_GAIN_0DB		0x08
+#define TWL4030_HSR_GAIN_MINUS_6DB	0x0C
+#define TWL4030_HSL_GAIN		0x03
+#define TWL4030_HSL_GAIN_PWR_DOWN	0x00
+#define TWL4030_HSL_GAIN_PLUS_6DB	0x01
+#define TWL4030_HSL_GAIN_0DB		0x02
+#define TWL4030_HSL_GAIN_MINUS_6DB	0x03
+
+/* HS_POPN_SET (0x24) Fields */
+#define TWL4030_VMID_EN			0x40
+#define	TWL4030_EXTMUTE			0x20
+#define TWL4030_RAMP_DELAY		0x1C
+#define TWL4030_RAMP_DELAY_20MS		0x00
+#define TWL4030_RAMP_DELAY_40MS		0x04
+#define TWL4030_RAMP_DELAY_81MS		0x08
+#define TWL4030_RAMP_DELAY_161MS	0x0C
+#define TWL4030_RAMP_DELAY_323MS	0x10
+#define TWL4030_RAMP_DELAY_645MS	0x14
+#define TWL4030_RAMP_DELAY_1291MS	0x18
+#define TWL4030_RAMP_DELAY_2581MS	0x1C
+#define TWL4030_RAMP_EN			0x02
+
+/* PREDL_CTL (0x25) */
+#define TWL4030_PREDL_GAIN		0x30
+
+/* PREDR_CTL (0x26) */
+#define TWL4030_PREDR_GAIN		0x30
+
+/* PRECKL_CTL (0x27) */
+#define TWL4030_PRECKL_GAIN		0x30
+
+/* PRECKR_CTL (0x28) */
+#define TWL4030_PRECKR_GAIN		0x30
+
+/* HFL_CTL (0x29, 0x2A) Fields */
+#define TWL4030_HF_CTL_HB_EN		0x04
+#define TWL4030_HF_CTL_LOOP_EN		0x08
+#define TWL4030_HF_CTL_RAMP_EN		0x10
+#define TWL4030_HF_CTL_REF_EN		0x20
+
+/* APLL_CTL (0x3A) Fields */
+#define TWL4030_APLL_EN			0x10
+#define TWL4030_APLL_INFREQ		0x0F
+#define TWL4030_APLL_INFREQ_19200KHZ	0x05
+#define TWL4030_APLL_INFREQ_26000KHZ	0x06
+#define TWL4030_APLL_INFREQ_38400KHZ	0x0F
+
+/* REG_MISC_SET_1 (0x3E) Fields */
+#define TWL4030_CLK64_EN		0x80
+#define TWL4030_SCRAMBLE_EN		0x40
+#define TWL4030_FMLOOP_EN		0x20
+#define TWL4030_SMOOTH_ANAVOL_EN	0x02
+#define TWL4030_DIGMIC_LR_SWAP_EN	0x01
+
+/* VIBRA_CTL (0x45) */
+#define TWL4030_VIBRA_EN		0x01
+#define TWL4030_VIBRA_DIR		0x02
+#define TWL4030_VIBRA_AUDIO_SEL_L1	(0x00 << 2)
+#define TWL4030_VIBRA_AUDIO_SEL_R1	(0x01 << 2)
+#define TWL4030_VIBRA_AUDIO_SEL_L2	(0x02 << 2)
+#define TWL4030_VIBRA_AUDIO_SEL_R2	(0x03 << 2)
+#define TWL4030_VIBRA_SEL		0x10
+#define TWL4030_VIBRA_DIR_SEL		0x20
+
+/* TWL4030 codec resource IDs */
+enum twl4030_codec_res {
+	TWL4030_CODEC_RES_POWER = 0,
+	TWL4030_CODEC_RES_APLL,
+	TWL4030_CODEC_RES_MAX,
+};
+
+int twl4030_codec_disable_resource(enum twl4030_codec_res id);
+int twl4030_codec_enable_resource(enum twl4030_codec_res id);
+
+#endif	/* End of __TWL4030_CODEC_H__ */
-- 
cgit v1.2.3


From 26276069d2f51955cf549faab5d3a71a4b37ba23 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Date: Wed, 4 Nov 2009 09:58:17 +0200
Subject: MFD: TWL4030: Add audio_mclk to the codec platform data

Add audio_mclk to the platform data struct for the
twl4030-codec MFD driver.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Acked-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 include/linux/i2c/twl4030.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h
index ba61add3e05a..5306a759cbde 100644
--- a/include/linux/i2c/twl4030.h
+++ b/include/linux/i2c/twl4030.h
@@ -414,6 +414,7 @@ struct twl4030_codec_vibra_data {
 };
 
 struct twl4030_codec_data {
+	unsigned int	audio_mclk;
 	struct twl4030_codec_audio_data		*audio;
 	struct twl4030_codec_vibra_data		*vibra;
 };
-- 
cgit v1.2.3


From cfaf6d2c1cb231f77e24109cb1460db429608bd4 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Date: Wed, 4 Nov 2009 09:58:19 +0200
Subject: MFD: twl4030-codec: APLL_INFREQ handling in the MFD driver

Configure the APLL_INFREQ field in the APLL_CTL register
based on the platform data.
Provide also a function for childs to query the audio_mclk
frequency.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Acked-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 drivers/mfd/twl4030-codec.c       | 35 +++++++++++++++++++++++++++++++++++
 include/linux/mfd/twl4030-codec.h |  1 +
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/twl4030-codec.c b/drivers/mfd/twl4030-codec.c
index 97103078dc2a..77b914907d7c 100644
--- a/drivers/mfd/twl4030-codec.c
+++ b/drivers/mfd/twl4030-codec.c
@@ -41,6 +41,7 @@ struct twl4030_codec_resource {
 };
 
 struct twl4030_codec {
+	unsigned int audio_mclk;
 	struct mutex mutex;
 	struct twl4030_codec_resource resource[TWL4030_CODEC_RES_MAX];
 	struct mfd_cell cells[TWL4030_CODEC_CELLS];
@@ -145,12 +146,45 @@ int twl4030_codec_disable_resource(unsigned id)
 }
 EXPORT_SYMBOL_GPL(twl4030_codec_disable_resource);
 
+unsigned int twl4030_codec_get_mclk(void)
+{
+	struct twl4030_codec *codec = platform_get_drvdata(twl4030_codec_dev);
+
+	return codec->audio_mclk;
+}
+EXPORT_SYMBOL_GPL(twl4030_codec_get_mclk);
+
 static int __devinit twl4030_codec_probe(struct platform_device *pdev)
 {
 	struct twl4030_codec *codec;
 	struct twl4030_codec_data *pdata = pdev->dev.platform_data;
 	struct mfd_cell *cell = NULL;
 	int ret, childs = 0;
+	u8 val;
+
+	if (!pdata) {
+		dev_err(&pdev->dev, "Platform data is missing\n");
+		return -EINVAL;
+	}
+
+	/* Configure APLL_INFREQ and disable APLL if enabled */
+	val = 0;
+	switch (pdata->audio_mclk) {
+	case 19200000:
+		val |= TWL4030_APLL_INFREQ_19200KHZ;
+		break;
+	case 26000000:
+		val |= TWL4030_APLL_INFREQ_26000KHZ;
+		break;
+	case 38400000:
+		val |= TWL4030_APLL_INFREQ_38400KHZ;
+		break;
+	default:
+		dev_err(&pdev->dev, "Invalid audio_mclk\n");
+		return -EINVAL;
+	}
+	twl4030_i2c_write_u8(TWL4030_MODULE_AUDIO_VOICE,
+					val, TWL4030_REG_APLL_CTL);
 
 	codec = kzalloc(sizeof(struct twl4030_codec), GFP_KERNEL);
 	if (!codec)
@@ -160,6 +194,7 @@ static int __devinit twl4030_codec_probe(struct platform_device *pdev)
 
 	twl4030_codec_dev = pdev;
 	mutex_init(&codec->mutex);
+	codec->audio_mclk = pdata->audio_mclk;
 
 	/* Codec power */
 	codec->resource[TWL4030_CODEC_RES_POWER].reg = TWL4030_REG_CODEC_MODE;
diff --git a/include/linux/mfd/twl4030-codec.h b/include/linux/mfd/twl4030-codec.h
index ef0a3041d759..2ec317c68e59 100644
--- a/include/linux/mfd/twl4030-codec.h
+++ b/include/linux/mfd/twl4030-codec.h
@@ -267,5 +267,6 @@ enum twl4030_codec_res {
 
 int twl4030_codec_disable_resource(enum twl4030_codec_res id);
 int twl4030_codec_enable_resource(enum twl4030_codec_res id);
+unsigned int twl4030_codec_get_mclk(void);
 
 #endif	/* End of __TWL4030_CODEC_H__ */
-- 
cgit v1.2.3


From 76b5c84f77c3abc92a3c4e185e7b78f17a0ed204 Mon Sep 17 00:00:00 2001
From: Jani Nikula <ext-jani.1.nikula@nokia.com>
Date: Thu, 5 Nov 2009 22:59:46 -0800
Subject: Input: add new keycodes useful in mobile devices

Add new codes for camera focus key, and camera lens cover, keypad slide,
front proximity switches.

Signed-off-by: Jani Nikula <ext-jani.1.nikula@nokia.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/input.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index c2b1a7d244d9..84b501ab0d8f 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -595,6 +595,8 @@ struct input_absinfo {
 #define KEY_NUMERIC_STAR	0x20a
 #define KEY_NUMERIC_POUND	0x20b
 
+#define KEY_CAMERA_FOCUS	0x210
+
 /* We avoid low common keys in module aliases so they don't get huge. */
 #define KEY_MIN_INTERESTING	KEY_MUTE
 #define KEY_MAX			0x2ff
@@ -677,6 +679,9 @@ struct input_absinfo {
 #define SW_LINEOUT_INSERT	0x06  /* set = inserted */
 #define SW_JACK_PHYSICAL_INSERT 0x07  /* set = mechanical switch set */
 #define SW_VIDEOOUT_INSERT	0x08  /* set = inserted */
+#define SW_CAMERA_LENS_COVER	0x09  /* set = lens covered */
+#define SW_KEYPAD_SLIDE		0x0a  /* set = keypad slide out */
+#define SW_FRONT_PROXIMITY	0x0b  /* set = front proximity sensor active */
 #define SW_MAX			0x0f
 #define SW_CNT			(SW_MAX+1)
 
-- 
cgit v1.2.3


From c4832c7bbc3f7a4813347e871d7238651bf437d3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 23 Nov 2009 10:34:39 +0100
Subject: netfilter: nf_ct_tcp: improve out-of-sync situation in TCP tracking

Without this patch, if we receive a SYN packet from the client while
the firewall is out-of-sync, we let it go through. Then, if we see
the SYN/ACK reply coming from the server, we destroy the conntrack
entry and drop the packet to trigger a new retransmission. Then,
the retransmision from the client is used to start a new clean
session.

This patch improves the current handling. Basically, if we see an
unexpected SYN packet, we annotate the TCP options. Then, if we
see the reply SYN/ACK, this means that the firewall was indeed
out-of-sync. Therefore, we set a clean new session from the existing
entry based on the annotated values.

This patch adds two new 8-bits fields that fit in a 16-bits gap of
the ip_ct_tcp structure.

This patch is particularly useful for conntrackd since the
asynchronous nature of the state-synchronization allows to have
backup nodes that are not perfect copies of the master. This helps
to improve the recovery under some worst-case scenarios.

I have tested this by creating lots of conntrack entries in wrong
state:

for ((i=1024;i<65535;i++)); do conntrack -I -p tcp -s 192.168.2.101 -d 192.168.2.2 --sport $i --dport 80 -t 800 --state ESTABLISHED -u ASSURED,SEEN_REPLY; done

Then, I make some TCP connections:

$ echo GET / | nc 192.168.2.2 80

The events show the result:

 [UPDATE] tcp      6 60 SYN_RECV src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]
 [UPDATE] tcp      6 432000 ESTABLISHED src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]
 [UPDATE] tcp      6 120 FIN_WAIT src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]
 [UPDATE] tcp      6 30 LAST_ACK src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]
 [UPDATE] tcp      6 120 TIME_WAIT src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]

and tcpdump shows no retransmissions:

20:47:57.271951 IP 192.168.2.101.33221 > 192.168.2.2.www: S 435402517:435402517(0) win 5840 <mss 1460,sackOK,timestamp 4294961827 0,nop,wscale 6>
20:47:57.273538 IP 192.168.2.2.www > 192.168.2.101.33221: S 3509927945:3509927945(0) ack 435402518 win 5792 <mss 1460,sackOK,timestamp 235681024 4294961827,nop,wscale 4>
20:47:57.273608 IP 192.168.2.101.33221 > 192.168.2.2.www: . ack 3509927946 win 92 <nop,nop,timestamp 4294961827 235681024>
20:47:57.273693 IP 192.168.2.101.33221 > 192.168.2.2.www: P 435402518:435402524(6) ack 3509927946 win 92 <nop,nop,timestamp 4294961827 235681024>
20:47:57.275492 IP 192.168.2.2.www > 192.168.2.101.33221: . ack 435402524 win 362 <nop,nop,timestamp 235681024 4294961827>
20:47:57.276492 IP 192.168.2.2.www > 192.168.2.101.33221: P 3509927946:3509928082(136) ack 435402524 win 362 <nop,nop,timestamp 235681025 4294961827>
20:47:57.276515 IP 192.168.2.101.33221 > 192.168.2.2.www: . ack 3509928082 win 108 <nop,nop,timestamp 4294961828 235681025>
20:47:57.276521 IP 192.168.2.2.www > 192.168.2.101.33221: F 3509928082:3509928082(0) ack 435402524 win 362 <nop,nop,timestamp 235681025 4294961827>
20:47:57.277369 IP 192.168.2.101.33221 > 192.168.2.2.www: F 435402524:435402524(0) ack 3509928083 win 108 <nop,nop,timestamp 4294961828 235681025>
20:47:57.279491 IP 192.168.2.2.www > 192.168.2.101.33221: . ack 435402525 win 362 <nop,nop,timestamp 235681025 4294961828>

I also added a rule to log invalid packets, with no occurrences  :-) .

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nf_conntrack_tcp.h |  3 ++
 net/netfilter/nf_conntrack_proto_tcp.c     | 51 ++++++++++++++++++++++++------
 2 files changed, 44 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h
index 4352feed2377..ece22e94dcbf 100644
--- a/include/linux/netfilter/nf_conntrack_tcp.h
+++ b/include/linux/netfilter/nf_conntrack_tcp.h
@@ -67,6 +67,9 @@ struct ip_ct_tcp
 	u_int32_t	last_ack;	/* Last sequence number seen in opposite dir */
 	u_int32_t	last_end;	/* Last seq + len */
 	u_int16_t	last_win;	/* Last window advertisement seen in dir */
+	/* For SYN packets while we may be out-of-sync */
+	u_int8_t	last_wscale;	/* Last window scaling factor seen */
+	u_int8_t	last_flags;	/* Last flags set */
 };
 
 #endif /* __KERNEL__ */
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 97a82ba75376..9cc6b5cb06af 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -908,23 +908,54 @@ static int tcp_packet(struct nf_conn *ct,
 			/* b) This SYN/ACK acknowledges a SYN that we earlier
 			 * ignored as invalid. This means that the client and
 			 * the server are both in sync, while the firewall is
-			 * not. We kill this session and block the SYN/ACK so
-			 * that the client cannot but retransmit its SYN and
-			 * thus initiate a clean new session.
+			 * not. We get in sync from the previously annotated
+			 * values.
 			 */
-			spin_unlock_bh(&ct->lock);
-			if (LOG_INVALID(net, IPPROTO_TCP))
-				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
-					  "nf_ct_tcp: killing out of sync session ");
-			nf_ct_kill(ct);
-			return NF_DROP;
+			old_state = TCP_CONNTRACK_SYN_SENT;
+			new_state = TCP_CONNTRACK_SYN_RECV;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
+				ct->proto.tcp.last_end;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
+				ct->proto.tcp.last_end;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
+				ct->proto.tcp.last_win == 0 ?
+					1 : ct->proto.tcp.last_win;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
+				ct->proto.tcp.last_wscale;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
+				ct->proto.tcp.last_flags;
+			memset(&ct->proto.tcp.seen[dir], 0,
+			       sizeof(struct ip_ct_tcp_state));
+			break;
 		}
 		ct->proto.tcp.last_index = index;
 		ct->proto.tcp.last_dir = dir;
 		ct->proto.tcp.last_seq = ntohl(th->seq);
 		ct->proto.tcp.last_end =
 		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
-
+		ct->proto.tcp.last_win = ntohs(th->window);
+
+		/* a) This is a SYN in ORIGINAL. The client and the server
+		 * may be in sync but we are not. In that case, we annotate
+		 * the TCP options and let the packet go through. If it is a
+		 * valid SYN packet, the server will reply with a SYN/ACK, and
+		 * then we'll get in sync. Otherwise, the server ignores it. */
+		if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
+			struct ip_ct_tcp_state seen = {};
+
+			ct->proto.tcp.last_flags =
+			ct->proto.tcp.last_wscale = 0;
+			tcp_options(skb, dataoff, th, &seen);
+			if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+				ct->proto.tcp.last_flags |=
+					IP_CT_TCP_FLAG_WINDOW_SCALE;
+				ct->proto.tcp.last_wscale = seen.td_scale;
+			}
+			if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
+				ct->proto.tcp.last_flags |=
+					IP_CT_TCP_FLAG_SACK_PERM;
+			}
+		}
 		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
-- 
cgit v1.2.3


From 4ed7c92d68a5387ba5f7030dc76eab03558e27f5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Nov 2009 11:37:29 +0100
Subject: perf_events: Undo some recursion damage

Make perf_swevent_get_recursion_context return a context number
and disable preemption.

This could be used to remove the IRQ disable from the trace bit
and index the per-cpu buffer with.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20091123103819.993226816@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h    |  8 ++---
 include/trace/ftrace.h        | 17 ++++++-----
 kernel/perf_event.c           | 71 +++++++++++++++++++------------------------
 kernel/trace/trace_kprobe.c   | 14 +++++----
 kernel/trace/trace_syscalls.c | 14 +++++----
 5 files changed, 61 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 74e98b1d3391..43adbd7f0010 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -874,8 +874,8 @@ extern int perf_output_begin(struct perf_output_handle *handle,
 extern void perf_output_end(struct perf_output_handle *handle);
 extern void perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len);
-extern int perf_swevent_get_recursion_context(int **recursion);
-extern void perf_swevent_put_recursion_context(int *recursion);
+extern int perf_swevent_get_recursion_context(void);
+extern void perf_swevent_put_recursion_context(int rctx);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -904,8 +904,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
 static inline void perf_event_init(void)				{ }
-static int perf_swevent_get_recursion_context(int **recursion)	{ return -1; }
-static void perf_swevent_put_recursion_context(int *recursion)		{ }
+static inline int  perf_swevent_get_recursion_context(void)  { return -1; }
+static inline void perf_swevent_put_recursion_context(int rctx)		{ }
 
 #endif
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index c222ef5238bf..c3417c13e3ed 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -724,8 +724,8 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 static void ftrace_profile_##call(proto)				\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
-	extern int perf_swevent_get_recursion_context(int **recursion); \
-	extern void perf_swevent_put_recursion_context(int *recursion); \
+	extern int perf_swevent_get_recursion_context(void);		\
+	extern void perf_swevent_put_recursion_context(int rctx);	\
 	struct ftrace_event_call *event_call = &event_##call;		\
 	extern void perf_tp_event(int, u64, u64, void *, int);		\
 	struct ftrace_raw_##call *entry;				\
@@ -736,8 +736,8 @@ static void ftrace_profile_##call(proto)				\
 	int __data_size;						\
 	char *trace_buf;						\
 	char *raw_data;							\
-	int *recursion;							\
 	int __cpu;							\
+	int rctx;							\
 	int pc;								\
 									\
 	pc = preempt_count();						\
@@ -753,8 +753,9 @@ static void ftrace_profile_##call(proto)				\
 									\
 	local_irq_save(irq_flags);					\
 									\
-	if (perf_swevent_get_recursion_context(&recursion))		\
-		goto end_recursion;						\
+	rctx = perf_swevent_get_recursion_context();			\
+	if (rctx < 0)							\
+		goto end_recursion;					\
 									\
 	__cpu = smp_processor_id();					\
 									\
@@ -781,9 +782,9 @@ static void ftrace_profile_##call(proto)				\
 	perf_tp_event(event_call->id, __addr, __count, entry,		\
 			     __entry_size);				\
 									\
-end:								\
-	perf_swevent_put_recursion_context(recursion);			\
-end_recursion:									\
+end:									\
+	perf_swevent_put_recursion_context(rctx);			\
+end_recursion:								\
 	local_irq_restore(irq_flags);					\
 									\
 }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 50f11b5f8c3d..0b0d5f72fe7d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3869,45 +3869,50 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
 	}
 }
 
-/*
- * Must be called with preemption disabled
- */
-int perf_swevent_get_recursion_context(int **recursion)
+int perf_swevent_get_recursion_context(void)
 {
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	int rctx;
 
 	if (in_nmi())
-		*recursion = &cpuctx->recursion[3];
+		rctx = 3;
 	else if (in_irq())
-		*recursion = &cpuctx->recursion[2];
+		rctx = 2;
 	else if (in_softirq())
-		*recursion = &cpuctx->recursion[1];
+		rctx = 1;
 	else
-		*recursion = &cpuctx->recursion[0];
+		rctx = 0;
 
-	if (**recursion)
+	if (cpuctx->recursion[rctx]) {
+		put_cpu_var(perf_cpu_context);
 		return -1;
+	}
 
-	(**recursion)++;
+	cpuctx->recursion[rctx]++;
+	barrier();
 
-	return 0;
+	return rctx;
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-void perf_swevent_put_recursion_context(int *recursion)
+void perf_swevent_put_recursion_context(int rctx)
 {
-	(*recursion)--;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	barrier();
+	cpuctx->recursion[rctx]++;
+	put_cpu_var(perf_cpu_context);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
-static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
-			       u64 nr, int nmi,
-			       struct perf_sample_data *data,
-			       struct pt_regs *regs)
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
 {
+	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
+	cpuctx = &__get_cpu_var(perf_cpu_context);
 	rcu_read_lock();
 	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
 				 nr, nmi, data, regs);
@@ -3921,34 +3926,22 @@ static void __do_perf_sw_event(enum perf_type_id type, u32 event_id,
 	rcu_read_unlock();
 }
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-				    u64 nr, int nmi,
-				    struct perf_sample_data *data,
-				    struct pt_regs *regs)
-{
-	int *recursion;
-
-	preempt_disable();
-
-	if (perf_swevent_get_recursion_context(&recursion))
-		goto out;
-
-	__do_perf_sw_event(type, event_id, nr, nmi, data, regs);
-
-	perf_swevent_put_recursion_context(recursion);
-out:
-	preempt_enable();
-}
-
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data;
+	int rctx;
+
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
+		return;
 
 	data.addr = addr;
 	data.raw  = NULL;
 
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+
+	perf_swevent_put_recursion_context(rctx);
 }
 
 static void perf_swevent_read(struct perf_event *event)
@@ -4172,7 +4165,7 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 		regs = task_pt_regs(current);
 
 	/* Trace events already protected against recursion */
-	__do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 				&data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 22e6f68b05b3..79ce6a2bd74f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1213,7 +1213,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	unsigned long irq_flags;
 	char *trace_buf;
 	char *raw_data;
-	int *recursion;
+	int rctx;
 
 	pc = preempt_count();
 	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1229,7 +1229,8 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	 */
 	local_irq_save(irq_flags);
 
-	if (perf_swevent_get_recursion_context(&recursion))
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		goto end_recursion;
 
 	__cpu = smp_processor_id();
@@ -1258,7 +1259,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
 	perf_tp_event(call->id, entry->ip, 1, entry, size);
 
 end:
-	perf_swevent_put_recursion_context(recursion);
+	perf_swevent_put_recursion_context(rctx);
 end_recursion:
 	local_irq_restore(irq_flags);
 
@@ -1276,8 +1277,8 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	int size, __size, i, pc, __cpu;
 	unsigned long irq_flags;
 	char *trace_buf;
-	int *recursion;
 	char *raw_data;
+	int rctx;
 
 	pc = preempt_count();
 	__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
@@ -1293,7 +1294,8 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	 */
 	local_irq_save(irq_flags);
 
-	if (perf_swevent_get_recursion_context(&recursion))
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		goto end_recursion;
 
 	__cpu = smp_processor_id();
@@ -1323,7 +1325,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
 	perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
 
 end:
-	perf_swevent_put_recursion_context(recursion);
+	perf_swevent_put_recursion_context(rctx);
 end_recursion:
 	local_irq_restore(irq_flags);
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 41b6dd963daa..9189cbe86079 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -481,8 +481,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	unsigned long flags;
 	char *trace_buf;
 	char *raw_data;
-	int *recursion;
 	int syscall_nr;
+	int rctx;
 	int size;
 	int cpu;
 
@@ -506,7 +506,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
 
-	if (perf_swevent_get_recursion_context(&recursion))
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		goto end_recursion;
 
 	cpu = smp_processor_id();
@@ -530,7 +531,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
 
 end:
-	perf_swevent_put_recursion_context(recursion);
+	perf_swevent_put_recursion_context(rctx);
 end_recursion:
 	local_irq_restore(flags);
 }
@@ -582,7 +583,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	int syscall_nr;
 	char *trace_buf;
 	char *raw_data;
-	int *recursion;
+	int rctx;
 	int size;
 	int cpu;
 
@@ -609,7 +610,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	/* Protect the per cpu buffer, begin the rcu read side */
 	local_irq_save(flags);
 
-	if (perf_swevent_get_recursion_context(&recursion))
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		goto end_recursion;
 
 	cpu = smp_processor_id();
@@ -634,7 +636,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
 
 end:
-	perf_swevent_put_recursion_context(recursion);
+	perf_swevent_put_recursion_context(rctx);
 end_recursion:
 	local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From e6db4876575f3fdd5b1df2cbff826df95ab9af6a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 23 Nov 2009 15:42:32 +0100
Subject: hw-breakpoints: Include only linux/perf_event.h from kernel part of
 bp headers

As userspace only needs the breakpoints enum types from the
breakpoints headers.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1258987355-8751-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hw_breakpoint.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 4659e0c55ea6..76a48ab9a81e 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -1,8 +1,6 @@
 #ifndef _LINUX_HW_BREAKPOINT_H
 #define _LINUX_HW_BREAKPOINT_H
 
-#include <linux/perf_event.h>
-
 enum {
 	HW_BREAKPOINT_LEN_1 = 1,
 	HW_BREAKPOINT_LEN_2 = 2,
@@ -19,6 +17,8 @@ enum {
 #ifdef __KERNEL__
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
+#include <linux/perf_event.h>
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
-- 
cgit v1.2.3


From 475cba4ec8ee6b427cc3567692e6f48dd483c069 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Mon, 23 Nov 2009 15:53:52 -0500
Subject: sctp: implement definition for SACK-IMMEDIATELY extension

This patch implement the definition for SACK-IMMEDIATELY
extension.

Section 3.  The I-bit in the DATA Chunk Header

   The following Figure 1 shows the extended DATA chunk.

    0                   1                   2                   3
    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   |   Type = 0    |  Res  |I|U|B|E|           Length              |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   |                              TSN                              |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   |        Stream Identifier      |     Stream Sequence Number    |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   |                  Payload Protocol Identifier                  |
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
   \                                                               \
   /                           User Data                           /
   \                                                               \
   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

                                 Figure 1

   The only difference between the DATA chunk in Figure 1 and the DATA
   chunk defined in [RFC4960] is the addition of the I-bit in the flags
   field of the chunk header.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
 include/linux/sctp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index b464b9d3d242..c20d3ce673c0 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -242,6 +242,7 @@ enum {
 	SCTP_DATA_FIRST_FRAG	= 0x02,
 	SCTP_DATA_NOT_FRAG	= 0x03,
 	SCTP_DATA_UNORDERED	= 0x04,
+	SCTP_DATA_SACK_IMM	= 0x08,
 };
 enum { SCTP_DATA_FRAG_MASK = 0x03, };
 
-- 
cgit v1.2.3


From fa7c27ee9394fc0d52404b2a89882e95868a60b9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 23 Nov 2009 22:30:12 +0100
Subject: hw-breakpoints: Fix misordered ifdef

Fix a misplaced ifdef. We need the perf event headers also in
off-case to avoid the following build error:

 include/linux/hw_breakpoint.h:94: error: expected declaration specifiers or '...' before 'perf_callback_t'
 include/linux/hw_breakpoint.h:102: error: expected declaration specifiers or '...' before 'perf_callback_t'
 include/linux/hw_breakpoint.h:109: error: expected declaration specifiers or '...' before 'perf_callback_t'
 include/linux/hw_breakpoint.h:116: error: expected declaration specifiers or '...' before 'perf_callback_t'

Reported-by: Kisskb-bot by Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
LKML-Reference: <1259011812-8093-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hw_breakpoint.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 76a48ab9a81e..c9f7f7c7b0e0 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -15,10 +15,11 @@ enum {
 };
 
 #ifdef __KERNEL__
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
 
 #include <linux/perf_event.h>
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
-- 
cgit v1.2.3


From b3a222e52e4d4be77cc4520a57af1a4a0d8222d1 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 23 Nov 2009 16:21:30 -0600
Subject: remove CONFIG_SECURITY_FILE_CAPABILITIES compile option

As far as I know, all distros currently ship kernels with default
CONFIG_SECURITY_FILE_CAPABILITIES=y.  Since having the option on
leaves a 'no_file_caps' option to boot without file capabilities,
the main reason to keep the option is that turning it off saves
you (on my s390x partition) 5k.  In particular, vmlinux sizes
came to:

without patch fscaps=n:		 	53598392
without patch fscaps=y:		 	53603406
with this patch applied:		53603342

with the security-next tree.

Against this we must weigh the fact that there is no simple way for
userspace to figure out whether file capabilities are supported,
while things like per-process securebits, capability bounding
sets, and adding bits to pI if CAP_SETPCAP is in pE are not supported
with SECURITY_FILE_CAPABILITIES=n, leaving a bit of a problem for
applications wanting to know whether they can use them and/or why
something failed.

It also adds another subtly different set of semantics which we must
maintain at the risk of severe security regressions.

So this patch removes the SECURITY_FILE_CAPABILITIES compile
option.  It drops the kernel size by about 50k over the stock
SECURITY_FILE_CAPABILITIES=y kernel, by removing the
cap_limit_ptraced_target() function.

Changelog:
	Nov 20: remove cap_limit_ptraced_target() as it's logic
		was ifndef'ed.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Andrew G. Morgan" <morgan@kernel.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/capability.h |  2 --
 include/linux/init_task.h  |  4 ---
 kernel/capability.c        |  2 --
 security/Kconfig           |  9 ------
 security/commoncap.c       | 72 ++--------------------------------------------
 5 files changed, 2 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index c8f2a5f70ed5..39e5ff512fbe 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -92,9 +92,7 @@ struct vfs_cap_data {
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
 #define _KERNEL_CAPABILITY_U32S    _LINUX_CAPABILITY_U32S_3
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 extern int file_caps_enabled;
-#endif
 
 typedef struct kernel_cap_struct {
 	__u32 cap[_KERNEL_CAPABILITY_U32S];
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 21a6f5d9af22..8d10aa7fd4c9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -83,16 +83,12 @@ extern struct group_info init_groups;
 #define INIT_IDS
 #endif
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 /*
  * Because of the reduced scope of CAP_SETPCAP when filesystem
  * capabilities are in effect, it is safe to allow CAP_SETPCAP to
  * be available in the default configuration.
  */
 # define CAP_INIT_BSET  CAP_FULL_SET
-#else
-# define CAP_INIT_BSET  CAP_INIT_EFF_SET
-#endif
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 #define INIT_TASK_RCU_PREEMPT(tsk)					\
diff --git a/kernel/capability.c b/kernel/capability.c
index c450375e855f..7f876e60521f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
 EXPORT_SYMBOL(__cap_full_set);
 EXPORT_SYMBOL(__cap_init_eff_set);
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 int file_caps_enabled = 1;
 
 static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
 	return 1;
 }
 __setup("no_file_caps", file_caps_disable);
-#endif
 
 /*
  * More recent versions of libcap are available from:
diff --git a/security/Kconfig b/security/Kconfig
index 95cc08913ca1..226b9556b25f 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -91,15 +91,6 @@ config SECURITY_PATH
 	  implement pathname based access controls.
 	  If you are unsure how to answer this question, answer N.
 
-config SECURITY_FILE_CAPABILITIES
-	bool "File POSIX Capabilities"
-	default n
-	help
-	  This enables filesystem capabilities, allowing you to give
-	  binaries a subset of root's powers without using setuid 0.
-
-	  If in doubt, answer N.
-
 config INTEL_TXT
 	bool "Enable Intel(R) Trusted Execution Technology (Intel(R) TXT)"
 	depends on HAVE_INTEL_TXT
diff --git a/security/commoncap.c b/security/commoncap.c
index 45b87af4ae5d..f800fdb3de94 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -173,7 +173,6 @@ int cap_capget(struct task_struct *target, kernel_cap_t *effective,
  */
 static inline int cap_inh_is_capped(void)
 {
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 
 	/* they are so limited unless the current task has the CAP_SETPCAP
 	 * capability
@@ -181,7 +180,6 @@ static inline int cap_inh_is_capped(void)
 	if (cap_capable(current, current_cred(), CAP_SETPCAP,
 			SECURITY_CAP_AUDIT) == 0)
 		return 0;
-#endif
 	return 1;
 }
 
@@ -239,8 +237,6 @@ static inline void bprm_clear_caps(struct linux_binprm *bprm)
 	bprm->cap_effective = false;
 }
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
-
 /**
  * cap_inode_need_killpriv - Determine if inode change affects privileges
  * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
@@ -421,49 +417,6 @@ out:
 	return rc;
 }
 
-#else
-int cap_inode_need_killpriv(struct dentry *dentry)
-{
-	return 0;
-}
-
-int cap_inode_killpriv(struct dentry *dentry)
-{
-	return 0;
-}
-
-int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
-{
-	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
- 	return -ENODATA;
-}
-
-static inline int get_file_caps(struct linux_binprm *bprm, bool *effective)
-{
-	bprm_clear_caps(bprm);
-	return 0;
-}
-#endif
-
-/*
- * Determine whether a exec'ing process's new permitted capabilities should be
- * limited to just what it already has.
- *
- * This prevents processes that are being ptraced from gaining access to
- * CAP_SETPCAP, unless the process they're tracing already has it, and the
- * binary they're executing has filecaps that elevate it.
- *
- *  Returns 1 if they should be limited, 0 if they are not.
- */
-static inline int cap_limit_ptraced_target(void)
-{
-#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
-	if (capable(CAP_SETPCAP))
-		return 0;
-#endif
-	return 1;
-}
-
 /**
  * cap_bprm_set_creds - Set up the proposed credentials for execve().
  * @bprm: The execution parameters, including the proposed creds
@@ -523,9 +476,8 @@ skip:
 			new->euid = new->uid;
 			new->egid = new->gid;
 		}
-		if (cap_limit_ptraced_target())
-			new->cap_permitted = cap_intersect(new->cap_permitted,
-							   old->cap_permitted);
+		new->cap_permitted = cap_intersect(new->cap_permitted,
+						   old->cap_permitted);
 	}
 
 	new->suid = new->fsuid = new->euid;
@@ -739,7 +691,6 @@ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
 	return 0;
 }
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 /*
  * Rationale: code calling task_setscheduler, task_setioprio, and
  * task_setnice, assumes that
@@ -820,22 +771,6 @@ static long cap_prctl_drop(struct cred *new, unsigned long cap)
 	return 0;
 }
 
-#else
-int cap_task_setscheduler (struct task_struct *p, int policy,
-			   struct sched_param *lp)
-{
-	return 0;
-}
-int cap_task_setioprio (struct task_struct *p, int ioprio)
-{
-	return 0;
-}
-int cap_task_setnice (struct task_struct *p, int nice)
-{
-	return 0;
-}
-#endif
-
 /**
  * cap_task_prctl - Implement process control functions for this security module
  * @option: The process control function requested
@@ -866,7 +801,6 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		error = !!cap_raised(new->cap_bset, arg2);
 		goto no_change;
 
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 	case PR_CAPBSET_DROP:
 		error = cap_prctl_drop(new, arg2);
 		if (error < 0)
@@ -917,8 +851,6 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		error = new->securebits;
 		goto no_change;
 
-#endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
-
 	case PR_GET_KEEPCAPS:
 		if (issecure(SECURE_KEEP_CAPS))
 			error = 1;
-- 
cgit v1.2.3


From c9286b7e293a1ea054e857ff3f5a23d0ad8d4f36 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Nov 2009 19:50:38 +0100
Subject: locking: Remove unused prototype

commit 910067d1(remove generic__raw_read_trylock()) removed the
implementation but left the prototype around. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/spinlock.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index f0ca7a7a1757..faf1482028df 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -79,8 +79,6 @@
  */
 #include <linux/spinlock_types.h>
 
-extern int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock);
-
 /*
  * Pull the __raw*() functions/declarations (UP-nondebug doesnt need them):
  */
-- 
cgit v1.2.3


From a49ed0bf427a8328a3296eebedc7697fe5098dbf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 Nov 2009 19:57:50 +0100
Subject: locking: Use __[SPIN|RW]_LOCK_UNLOCKED in [spin|rw]_lock_init()

SPIN_LOCK_UNLOCKED and RW_LOCK_UNLOCKED are deprecated. Replace them
with the __*_LOCK_UNLOCKED variants.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/spinlock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index faf1482028df..71dccfeb0d88 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -100,7 +100,7 @@ do {								\
 
 #else
 # define spin_lock_init(lock)					\
-	do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0)
+	do { *(lock) = __SPIN_LOCK_UNLOCKED(lock); } while (0)
 #endif
 
 #ifdef CONFIG_DEBUG_SPINLOCK
@@ -114,7 +114,7 @@ do {								\
 } while (0)
 #else
 # define rwlock_init(lock)					\
-	do { *(lock) = RW_LOCK_UNLOCKED; } while (0)
+	do { *(lock) = __RW_LOCK_UNLOCKED(lock); } while (0)
 #endif
 
 #define spin_is_locked(lock)	__raw_spin_is_locked(&(lock)->raw_lock)
-- 
cgit v1.2.3


From ad85dfe67bbf13d5fa20764e4ce801a1e6e526d8 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 18 Nov 2009 15:52:51 +0100
Subject: DRBD: Now the code is 8.3.6 + 3 fixes (without compat crap)

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 include/linux/drbd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 18942ad115d9..99a4d76694ed 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,7 +53,7 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.5"
+#define REL_VERSION "8.3.6"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 91
-- 
cgit v1.2.3


From ff038f5c37c2070829004a0678372766c2b32180 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 18 Nov 2009 20:27:27 -0500
Subject: tracing: Create new TRACE_EVENT_TEMPLATE

There are some places in the kernel that define several tracepoints and
they are all identical besides the name. The code to enable, disable and
record is created for every trace point even if most of the code is
identical.

This patch adds TRACE_EVENT_TEMPLATE that lets the developer create
a template TRACE_EVENT and create trace points with DEFINE_EVENT, which
is based off of a given template. Each trace point used by this
will share most of the code, and bring down the size of the kernel
when there are several duplicate events.

Usage is:

TRACE_EVENT_TEMPLATE(name, proto, args, tstruct, assign, print);

Which would be the same as defining a normal TRACE_EVENT.

To create the trace events that the trace points will use:

DEFINE_EVENT(template, name, proto, args) is done. The template
is the name of the TRACE_EVENT_TEMPLATE to use. The name is the
name of the trace point. The parameters proto and args must be the same
as the proto and args of the template. If they are not the same,
then a compile error will result. I tried hard removing this duplication
but the C preprocessor is not powerful enough (or my CPP magic
experience points is not at a high enough level) to not need them.

A lot of trace events are coming in with new XFS development. Most of
the trace points are identical except for the name. The following shows
the advantage of having TRACE_EVENT_TEMPLATE:

$ size fs/xfs/xfs.o.*
    text          data     bss     dec     hex filename
  452114          2788    3520  458422   6feb6 fs/xfs/xfs.o.old
  638482         38116    3744  680342   a6196 fs/xfs/xfs.o.template
  996954         38116    4480 1039550   fdcbe fs/xfs/xfs.o.trace

xfs.o.old is without any tracepoints.
xfs.o.template uses the new TRACE_EVENT_TEMPLATE.
xfs.o.trace uses the current TRACE_EVENT macros.

Requested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/tracepoint.h   |   4 ++
 include/trace/define_trace.h |   6 ++
 include/trace/ftrace.h       | 149 +++++++++++++++++++++++++++++++------------
 3 files changed, 117 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 2aac8a83e89b..88a5b5a809ec 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -280,6 +280,10 @@ static inline void tracepoint_synchronize_unregister(void)
  * TRACE_EVENT_FN to perform any (un)registration work.
  */
 
+#define TRACE_EVENT_TEMPLATE(name, proto, args, tstruct, assign, print)
+#define DEFINE_EVENT(template, name, proto, args)		\
+	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+
 #define TRACE_EVENT(name, proto, args, struct, assign, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define TRACE_EVENT_FN(name, proto, args, struct,		\
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 2a4b3bf74033..244985814a43 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -31,6 +31,10 @@
 		assign, print, reg, unreg)			\
 	DEFINE_TRACE_FN(name, reg, unreg)
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args) \
+	DEFINE_TRACE(name)
+
 #undef DECLARE_TRACE
 #define DECLARE_TRACE(name, proto, args)	\
 	DEFINE_TRACE(name)
@@ -63,6 +67,8 @@
 
 #undef TRACE_EVENT
 #undef TRACE_EVENT_FN
+#undef TRACE_EVENT_TEMPLATE
+#undef DEFINE_EVENT
 #undef TRACE_HEADER_MULTI_READ
 
 /* Only undef what we defined in this file */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index c3417c13e3ed..2969f65d8002 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -18,6 +18,26 @@
 
 #include <linux/ftrace_event.h>
 
+/*
+ * TRACE_EVENT_TEMPLATE can be used to add a generic function
+ * handlers for events. That is, if all events have the same
+ * parameters and just have distinct trace points.
+ * Each tracepoint can be defined with DEFINE_EVENT and that
+ * will map the TRACE_EVENT_TEMPLATE to the tracepoint.
+ *
+ * TRACE_EVENT is a one to one mapping between tracepoint and template.
+ */
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
+	TRACE_EVENT_TEMPLATE(name,			       \
+			     PARAMS(proto),		       \
+			     PARAMS(args),		       \
+			     PARAMS(tstruct),		       \
+			     PARAMS(assign),		       \
+			     PARAMS(print));		       \
+	DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
+
+
 #undef __field
 #define __field(type, item)		type	item;
 
@@ -36,13 +56,15 @@
 #undef TP_STRUCT__entry
 #define TP_STRUCT__entry(args...) args
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
-	struct ftrace_raw_##name {				\
-		struct trace_entry	ent;			\
-		tstruct						\
-		char			__data[0];		\
-	};							\
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(name, proto, args, tstruct, assign, print)	\
+	struct ftrace_raw_##name {					\
+		struct trace_entry	ent;				\
+		tstruct							\
+		char			__data[0];			\
+	};
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)	\
 	static struct ftrace_event_call event_##name
 
 #undef __cpparg
@@ -89,12 +111,15 @@
 #undef __string
 #define __string(item, src) __dynamic_array(char, item, -1)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
 	struct ftrace_data_offsets_##call {				\
 		tstruct;						\
 	};
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -170,8 +195,8 @@
 #undef TP_perf_assign
 #define TP_perf_assign(args...)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, func, print)	\
 static int								\
 ftrace_format_##call(struct ftrace_event_call *unused,			\
 		      struct trace_seq *s)				\
@@ -186,6 +211,9 @@ ftrace_format_##call(struct ftrace_event_call *unused,			\
 	return ret;							\
 }
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -255,10 +283,11 @@ ftrace_format_##call(struct ftrace_event_call *unused,			\
 		ftrace_print_symbols_seq(p, value, symbols);		\
 	})
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
 static enum print_line_t						\
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
+ftrace_raw_output_id_##call(int event_id, const char *name,		\
+			    struct trace_iterator *iter, int flags)	\
 {									\
 	struct trace_seq *s = &iter->seq;				\
 	struct ftrace_raw_##call *field;				\
@@ -268,7 +297,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 									\
 	entry = iter->ent;						\
 									\
-	if (entry->type != event_##call.id) {				\
+	if (entry->type != event_id) {					\
 		WARN_ON_ONCE(1);					\
 		return TRACE_TYPE_UNHANDLED;				\
 	}								\
@@ -277,14 +306,25 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 									\
 	p = &get_cpu_var(ftrace_event_seq);				\
 	trace_seq_init(p);						\
-	ret = trace_seq_printf(s, #call ": " print);			\
+	ret = trace_seq_printf(s, "%s: ", name);			\
+	if (ret)							\
+		ret = trace_seq_printf(s, print);			\
 	put_cpu();							\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
 	return TRACE_TYPE_HANDLED;					\
 }
-	
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)			\
+static enum print_line_t						\
+ftrace_raw_output_##name(struct trace_iterator *iter, int flags)	\
+{									\
+	return ftrace_raw_output_id_##template(event_##name.id,		\
+					       #name, iter, flags);	\
+}
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #undef __field_ext
@@ -318,8 +358,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 #undef __string
 #define __string(item, src) __dynamic_array(char, item, -1)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, func, print)	\
 static int								\
 ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\
 {									\
@@ -335,6 +375,9 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\
 	return ret;							\
 }
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -361,10 +404,10 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\
 	__data_size += (len) * sizeof(type);
 
 #undef __string
-#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)       \
+#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
 static inline int ftrace_get_offsets_##call(				\
 	struct ftrace_data_offsets_##call *__data_offsets, proto)       \
 {									\
@@ -376,6 +419,9 @@ static inline int ftrace_get_offsets_##call(				\
 	return __data_size;						\
 }
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #ifdef CONFIG_EVENT_PROFILE
@@ -397,19 +443,22 @@ static inline int ftrace_get_offsets_##call(				\
  *
  */
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)			\
 									\
-static void ftrace_profile_##call(proto);				\
+static void ftrace_profile_##name(proto);				\
 									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *unused)\
+static int ftrace_profile_enable_##name(struct ftrace_event_call *unused)\
 {									\
-	return register_trace_##call(ftrace_profile_##call);		\
+	return register_trace_##name(ftrace_profile_##name);		\
 }									\
 									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *unused)\
+static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\
 {									\
-	unregister_trace_##call(ftrace_profile_##call);			\
+	unregister_trace_##name(ftrace_profile_##name);			\
 }
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
@@ -550,15 +599,13 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *unused)\
 #define __assign_str(dst, src)						\
 	strcpy(__get_str(dst), src);
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-									\
-static struct ftrace_event_call event_##call;				\
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
 									\
-static void ftrace_raw_event_##call(proto)				\
+static void ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \
+				       proto)				\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
-	struct ftrace_event_call *event_call = &event_##call;		\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	struct ring_buffer *buffer;					\
@@ -572,7 +619,7 @@ static void ftrace_raw_event_##call(proto)				\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 									\
 	event = trace_current_buffer_lock_reserve(&buffer,		\
-				 event_##call.id,			\
+				 event_call->id,			\
 				 sizeof(*entry) + __data_size,		\
 				 irq_flags, pc);			\
 	if (!event)							\
@@ -587,6 +634,14 @@ static void ftrace_raw_event_##call(proto)				\
 	if (!filter_current_check_discard(buffer, event_call, entry, event)) \
 		trace_nowake_buffer_unlock_commit(buffer,		\
 						  event, irq_flags, pc); \
+}
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, call, proto, args)			\
+									\
+static void ftrace_raw_event_##call(proto)				\
+{									\
+	ftrace_raw_event_id_##template(&event_##call, args);		\
 }									\
 									\
 static int ftrace_raw_reg_event_##call(struct ftrace_event_call *unused)\
@@ -630,8 +685,8 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
 	.regfunc		= ftrace_raw_reg_event_##call,		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
-	.show_format		= ftrace_format_##call,			\
-	.define_fields		= ftrace_define_fields_##call,		\
+	.show_format		= ftrace_format_##template,		\
+	.define_fields		= ftrace_define_fields_##template,	\
 	_TRACE_PROFILE_INIT(call)					\
 }
 
@@ -719,14 +774,15 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #undef __perf_count
 #define __perf_count(c) __count = (c)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-static void ftrace_profile_##call(proto)				\
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
+static void								\
+ftrace_profile_templ_##call(struct ftrace_event_call *event_call,	\
+			    proto)					\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	extern int perf_swevent_get_recursion_context(void);		\
 	extern void perf_swevent_put_recursion_context(int rctx);	\
-	struct ftrace_event_call *event_call = &event_##call;		\
 	extern void perf_tp_event(int, u64, u64, void *, int);		\
 	struct ftrace_raw_##call *entry;				\
 	u64 __addr = 0, __count = 1;					\
@@ -789,6 +845,15 @@ end_recursion:								\
 									\
 }
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, call, proto, args)		\
+static void ftrace_profile_##call(proto)			\
+{								\
+	struct ftrace_event_call *event_call = &event_##call;	\
+								\
+	ftrace_profile_templ_##template(event_call, args);	\
+}
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 #endif /* CONFIG_EVENT_PROFILE */
 
-- 
cgit v1.2.3


From e5bc9721684e9412f3e0465222f317c362a8ab47 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 18 Nov 2009 20:36:26 -0500
Subject: tracing: Create new DEFINE_EVENT_PRINT

After creating the TRACE_EVENT_TEMPLATE I started to look at other
trace points to see what duplication was made. I noticed that there
are several trace points where they are almost identical except for
the name and the output format. Since TRACE_EVENT_TEMPLATE was successful
in bringing down the size of trace events, I added a DEFINE_EVENT_PRINT.

DEFINE_EVENT_PRINT is used just like DEFINE_EVENT is. That is, the
DEFINE_EVENT_PRINT also uses a TRACE_EVENT_TEMPLATE, but it allows the
developer to overwrite the print format. If there are two or more
TRACE_EVENTS that are identical except for the name and print, then
they can be converted to use a TRACE_EVENT_TEMPLATE. Since the
TRACE_EVENT_TEMPLATE already does the print output, the first trace event
would have its print format held in the TRACE_EVENT_TEMPLATE and
be defined with a DEFINE_EVENT. The rest will use the DEFINE_EVENT_PRINT
and override the print format.

Converting the sched trace points to both DEFINE_EVENT and
DEFINE_EVENT_PRINT. Five were converted to DEFINE_EVENT and two were
converted to DEFINE_EVENT_PRINT.

I was able to get the following:

$ size kernel/sched.o-*
   text	   data	    bss	    dec	    hex	filename
  79299	   6776	   2520	  88595	  15a13	kernel/sched.o-notrace
 101941	  11896	   2584	 116421	  1c6c5	kernel/sched.o-templ
 104779	  11896	   2584	 119259	  1d1db	kernel/sched.o-trace

sched.o-notrace is the scheduler compiled with no trace points.
sched.o-templ is with the use of DEFINE_EVENT and DEFINE_EVENT_PRINT
sched.o-trace is the current trace events.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/tracepoint.h   |   2 +
 include/trace/define_trace.h |   5 ++
 include/trace/ftrace.h       | 123 +++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 126 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 88a5b5a809ec..7063383cca13 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -283,6 +283,8 @@ static inline void tracepoint_synchronize_unregister(void)
 #define TRACE_EVENT_TEMPLATE(name, proto, args, tstruct, assign, print)
 #define DEFINE_EVENT(template, name, proto, args)		\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 
 #define TRACE_EVENT(name, proto, args, struct, assign, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 244985814a43..5d7d855ae21e 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -35,6 +35,10 @@
 #define DEFINE_EVENT(template, name, proto, args) \
 	DEFINE_TRACE(name)
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DEFINE_TRACE(name)
+
 #undef DECLARE_TRACE
 #define DECLARE_TRACE(name, proto, args)	\
 	DEFINE_TRACE(name)
@@ -69,6 +73,7 @@
 #undef TRACE_EVENT_FN
 #undef TRACE_EVENT_TEMPLATE
 #undef DEFINE_EVENT
+#undef DEFINE_EVENT_PRINT
 #undef TRACE_HEADER_MULTI_READ
 
 /* Only undef what we defined in this file */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 2969f65d8002..b0461772bc8d 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -67,6 +67,10 @@
 #define DEFINE_EVENT(template, name, proto, args)	\
 	static struct ftrace_event_call event_##name
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #undef __cpparg
 #define __cpparg(arg...) arg
 
@@ -120,6 +124,10 @@
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -198,15 +206,28 @@
 #undef TRACE_EVENT_TEMPLATE
 #define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, func, print)	\
 static int								\
-ftrace_format_##call(struct ftrace_event_call *unused,			\
-		      struct trace_seq *s)				\
+ftrace_format_setup_##call(struct ftrace_event_call *unused,		\
+			   struct trace_seq *s)				\
 {									\
 	struct ftrace_raw_##call field __attribute__((unused));		\
 	int ret = 0;							\
 									\
 	tstruct;							\
 									\
-	trace_seq_printf(s, "\nprint fmt: " print);			\
+	return ret;							\
+}									\
+									\
+static int								\
+ftrace_format_##call(struct ftrace_event_call *unused,			\
+		     struct trace_seq *s)				\
+{									\
+	int ret = 0;							\
+									\
+	ret = ftrace_format_setup_##call(unused, s);			\
+	if (!ret)							\
+		return ret;						\
+									\
+	ret = trace_seq_printf(s, "\nprint fmt: " print);		\
 									\
 	return ret;							\
 }
@@ -214,6 +235,23 @@ ftrace_format_##call(struct ftrace_event_call *unused,			\
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)		\
+static int								\
+ftrace_format_##name(struct ftrace_event_call *unused,			\
+		      struct trace_seq *s)				\
+{									\
+	int ret = 0;							\
+									\
+	ret = ftrace_format_setup_##template(unused, s);		\
+	if (!ret)							\
+		return ret;						\
+									\
+	trace_seq_printf(s, "\nprint fmt: " print);			\
+									\
+	return ret;							\
+}
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -325,6 +363,38 @@ ftrace_raw_output_##name(struct trace_iterator *iter, int flags)	\
 					       #name, iter, flags);	\
 }
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, call, proto, args, print)		\
+static enum print_line_t						\
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
+{									\
+	struct trace_seq *s = &iter->seq;				\
+	struct ftrace_raw_##template *field;				\
+	struct trace_entry *entry;					\
+	struct trace_seq *p;						\
+	int ret;							\
+									\
+	entry = iter->ent;						\
+									\
+	if (entry->type != event_##call.id) {				\
+		WARN_ON_ONCE(1);					\
+		return TRACE_TYPE_UNHANDLED;				\
+	}								\
+									\
+	field = (typeof(field))entry;					\
+									\
+	p = &get_cpu_var(ftrace_event_seq);				\
+	trace_seq_init(p);						\
+	ret = trace_seq_printf(s, "%s: ", #call);			\
+	if (ret)							\
+		ret = trace_seq_printf(s, print);			\
+	put_cpu();							\
+	if (!ret)							\
+		return TRACE_TYPE_PARTIAL_LINE;				\
+									\
+	return TRACE_TYPE_HANDLED;					\
+}
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #undef __field_ext
@@ -378,6 +448,10 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -422,6 +496,10 @@ static inline int ftrace_get_offsets_##call(				\
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #ifdef CONFIG_EVENT_PROFILE
@@ -461,6 +539,10 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\
 	unregister_trace_##name(ftrace_profile_##name);			\
 }
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #endif
@@ -674,7 +756,19 @@ static int ftrace_raw_init_event_##call(struct ftrace_event_call *unused)\
 	event_##call.id = id;						\
 	INIT_LIST_HEAD(&event_##call.fields);				\
 	return 0;							\
-}									\
+}
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef TRACE_EVENT_TEMPLATE
+#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, call, proto, args)			\
 									\
 static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
@@ -690,6 +784,23 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	_TRACE_PROFILE_INIT(call)					\
 }
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, call, proto, args, print)		\
+									\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name			= #call,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.event			= &ftrace_event_type_##call,		\
+	.raw_init		= ftrace_raw_init_event_##call,		\
+	.regfunc		= ftrace_raw_reg_event_##call,		\
+	.unregfunc		= ftrace_raw_unreg_event_##call,	\
+	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##template,	\
+	_TRACE_PROFILE_INIT(call)					\
+}
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -854,6 +965,10 @@ static void ftrace_profile_##call(proto)			\
 	ftrace_profile_templ_##template(event_call, args);	\
 }
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 #endif /* CONFIG_EVENT_PROFILE */
 
-- 
cgit v1.2.3


From 35a8a3fdcd4f973a5430e868f2f2a5c363803a5b Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 25 Nov 2009 17:50:00 +0100
Subject: drbd: moved CN_IDX_DRBD and CN_VAL_DRBD to the right file

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 include/linux/connector.h | 2 ++
 include/linux/drbd.h      | 7 -------
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/connector.h b/include/linux/connector.h
index 3a14615fd35c..72ba63eb83c5 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -43,6 +43,8 @@
 #define CN_DST_VAL			0x1
 #define CN_IDX_DM			0x7	/* Device Mapper */
 #define CN_VAL_DM_USERSPACE_LOG		0x1
+#define CN_IDX_DRBD			0x8
+#define CN_VAL_DRBD			0x1
 
 #define CN_NETLINK_USERS		8
 
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 99a4d76694ed..e84f4733cb55 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -322,13 +322,6 @@ enum drbd_timeout_flag {
 #define DRBD_NL_CREATE_DEVICE 0x01
 #define DRBD_NL_SET_DEFAULTS  0x02
 
-/* The following line should be moved over to linux/connector.h
- * when the time comes */
-#ifndef CN_IDX_DRBD
-# define CN_IDX_DRBD			0x4
-/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */
-#endif
-#define CN_VAL_DRBD			0x1
 
 /* For searching a vacant cn_idx value */
 #define CN_IDX_STEP			6977
-- 
cgit v1.2.3


From 4e242d1616781f9f1f0b01abf878700b259cd8b5 Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Wed, 25 Nov 2009 00:29:51 +0000
Subject: xfrm: Define new XFRM netlink auth attribute with specified
 truncation bits

The new XFRMA_ALG_AUTH_TRUNC attribute taking a xfrm_algo_auth as
argument allows the installation of authentication algorithms with
a truncation length specified in userspace, i.e. SHA256 with 128 bit
instead of 96 bit truncation.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 3246f0e66bcc..29e04beb1fc9 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -90,6 +90,13 @@ struct xfrm_algo {
 	char		alg_key[0];
 };
 
+struct xfrm_algo_auth {
+	char		alg_name[64];
+	unsigned int	alg_key_len;    /* in bits */
+	unsigned int	alg_trunc_len;  /* in bits */
+	char		alg_key[0];
+};
+
 struct xfrm_algo_aead {
 	char		alg_name[64];
 	unsigned int	alg_key_len;	/* in bits */
@@ -274,6 +281,7 @@ enum xfrm_attr_type_t {
 	XFRMA_MIGRATE,
 	XFRMA_ALG_AEAD,		/* struct xfrm_algo_aead */
 	XFRMA_KMADDRESS,        /* struct xfrm_user_kmaddress */
+	XFRMA_ALG_AUTH_TRUNC,	/* struct xfrm_algo_auth */
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
-- 
cgit v1.2.3


From 091ad3658e3c76c5fb05f65bfb64a0246f8f31b5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 26 Nov 2009 09:04:55 +0100
Subject: events: Rename TRACE_EVENT_TEMPLATE() to DECLARE_EVENT_CLASS()

It is not quite obvious at first sight what TRACE_EVENT_TEMPLATE
does: does it define an event as well beyond defining a template?

To clarify this, rename it to DECLARE_EVENT_CLASS, which follows
the various 'DECLARE_*()' idioms we already have in the kernel:

  DECLARE_EVENT_CLASS(class)

    DEFINE_EVENT(class, event1)
    DEFINE_EVENT(class, event2)
    DEFINE_EVENT(class, event3)

To complete this logic we should also rename TRACE_EVENT() to:

  DEFINE_SINGLE_EVENT(single_event)

... but in a more quiet moment of the kernel cycle.

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B0E286A.2000405@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tracepoint.h   |  2 +-
 include/trace/define_trace.h |  2 +-
 include/trace/events/sched.h |  6 +++---
 include/trace/ftrace.h       | 46 ++++++++++++++++++++++----------------------
 4 files changed, 28 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 7063383cca13..f59604ed0ec6 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -280,7 +280,7 @@ static inline void tracepoint_synchronize_unregister(void)
  * TRACE_EVENT_FN to perform any (un)registration work.
  */
 
-#define TRACE_EVENT_TEMPLATE(name, proto, args, tstruct, assign, print)
+#define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print)
 #define DEFINE_EVENT(template, name, proto, args)		\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 5d7d855ae21e..5acfb1eb4df9 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -71,7 +71,7 @@
 
 #undef TRACE_EVENT
 #undef TRACE_EVENT_FN
-#undef TRACE_EVENT_TEMPLATE
+#undef DECLARE_EVENT_CLASS
 #undef DEFINE_EVENT
 #undef DEFINE_EVENT_PRINT
 #undef TRACE_HEADER_MULTI_READ
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 238f74b58486..5ce795021851 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -83,7 +83,7 @@ TRACE_EVENT(sched_wait_task,
  * (NOTE: the 'rq' argument is not used by generic trace events,
  *        but used by the latency tracer plugin. )
  */
-TRACE_EVENT_TEMPLATE(sched_wakeup_template,
+DECLARE_EVENT_CLASS(sched_wakeup_template,
 
 	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
 
@@ -197,7 +197,7 @@ TRACE_EVENT(sched_migrate_task,
 		  __entry->orig_cpu, __entry->dest_cpu)
 );
 
-TRACE_EVENT_TEMPLATE(sched_process_template,
+DECLARE_EVENT_CLASS(sched_process_template,
 
 	TP_PROTO(struct task_struct *p),
 
@@ -316,7 +316,7 @@ TRACE_EVENT(sched_signal_send,
  * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
  *     adding sched_stat support to SCHED_FIFO/RR would be welcome.
  */
-TRACE_EVENT_TEMPLATE(sched_stat_template,
+DECLARE_EVENT_CLASS(sched_stat_template,
 
 	TP_PROTO(struct task_struct *tsk, u64 delay),
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index b0461772bc8d..2c9c073e45ad 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -19,17 +19,17 @@
 #include <linux/ftrace_event.h>
 
 /*
- * TRACE_EVENT_TEMPLATE can be used to add a generic function
+ * DECLARE_EVENT_CLASS can be used to add a generic function
  * handlers for events. That is, if all events have the same
  * parameters and just have distinct trace points.
  * Each tracepoint can be defined with DEFINE_EVENT and that
- * will map the TRACE_EVENT_TEMPLATE to the tracepoint.
+ * will map the DECLARE_EVENT_CLASS to the tracepoint.
  *
  * TRACE_EVENT is a one to one mapping between tracepoint and template.
  */
 #undef TRACE_EVENT
 #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
-	TRACE_EVENT_TEMPLATE(name,			       \
+	DECLARE_EVENT_CLASS(name,			       \
 			     PARAMS(proto),		       \
 			     PARAMS(args),		       \
 			     PARAMS(tstruct),		       \
@@ -56,8 +56,8 @@
 #undef TP_STRUCT__entry
 #define TP_STRUCT__entry(args...) args
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(name, proto, args, tstruct, assign, print)	\
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print)	\
 	struct ftrace_raw_##name {					\
 		struct trace_entry	ent;				\
 		tstruct							\
@@ -115,8 +115,8 @@
 #undef __string
 #define __string(item, src) __dynamic_array(char, item, -1)
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 	struct ftrace_data_offsets_##call {				\
 		tstruct;						\
 	};
@@ -203,8 +203,8 @@
 #undef TP_perf_assign
 #define TP_perf_assign(args...)
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, func, print)	\
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print)	\
 static int								\
 ftrace_format_setup_##call(struct ftrace_event_call *unused,		\
 			   struct trace_seq *s)				\
@@ -321,8 +321,8 @@ ftrace_format_##name(struct ftrace_event_call *unused,			\
 		ftrace_print_symbols_seq(p, value, symbols);		\
 	})
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static enum print_line_t						\
 ftrace_raw_output_id_##call(int event_id, const char *name,		\
 			    struct trace_iterator *iter, int flags)	\
@@ -428,8 +428,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 #undef __string
 #define __string(item, src) __dynamic_array(char, item, -1)
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, func, print)	\
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print)	\
 static int								\
 ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\
 {									\
@@ -480,8 +480,8 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\
 #undef __string
 #define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static inline int ftrace_get_offsets_##call(				\
 	struct ftrace_data_offsets_##call *__data_offsets, proto)       \
 {									\
@@ -521,8 +521,8 @@ static inline int ftrace_get_offsets_##call(				\
  *
  */
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)
 
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)			\
@@ -681,8 +681,8 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\
 #define __assign_str(dst, src)						\
 	strcpy(__get_str(dst), src);
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 									\
 static void ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \
 				       proto)				\
@@ -764,8 +764,8 @@ static int ftrace_raw_init_event_##call(struct ftrace_event_call *unused)\
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)
 
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)			\
@@ -885,8 +885,8 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #undef __perf_count
 #define __perf_count(c) __count = (c)
 
-#undef TRACE_EVENT_TEMPLATE
-#define TRACE_EVENT_TEMPLATE(call, proto, args, tstruct, assign, print)	\
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static void								\
 ftrace_profile_templ_##call(struct ftrace_event_call *event_call,	\
 			    proto)					\
-- 
cgit v1.2.3


From 2d4dc890b5c8fabd818a8586607e6843c4375e62 Mon Sep 17 00:00:00 2001
From: Ilya Loginov <isloginov@gmail.com>
Date: Thu, 26 Nov 2009 09:16:19 +0100
Subject: block: add helpers to run flush_dcache_page() against a bio and a
 request's pages

Mtdblock driver doesn't call flush_dcache_page for pages in request.  So,
this causes problems on architectures where the icache doesn't fill from
the dcache or with dcache aliases.  The patch fixes this.

The ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE symbol was introduced to avoid
pointless empty cache-thrashing loops on architectures for which
flush_dcache_page() is a no-op.  Every architecture was provided with this
flush pages on architectires where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is
equal 1 or do nothing otherwise.

See "fix mtd_blkdevs problem with caches on some architectures" discussion
on LKML for more information.

Signed-off-by: Ilya Loginov <isloginov@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Peter Horton <phorton@bitbox.co.uk>
Cc: "Ed L. Cashin" <ecashin@coraid.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/alpha/include/asm/cacheflush.h      |  1 +
 arch/arm/include/asm/cacheflush.h        |  1 +
 arch/avr32/include/asm/cacheflush.h      |  1 +
 arch/blackfin/include/asm/cacheflush.h   |  2 ++
 arch/cris/include/asm/cacheflush.h       |  1 +
 arch/frv/include/asm/cacheflush.h        |  1 +
 arch/h8300/include/asm/cacheflush.h      |  1 +
 arch/ia64/include/asm/cacheflush.h       |  1 +
 arch/m32r/include/asm/cacheflush.h       |  3 +++
 arch/m68k/include/asm/cacheflush_mm.h    |  1 +
 arch/m68k/include/asm/cacheflush_no.h    |  1 +
 arch/microblaze/include/asm/cacheflush.h |  1 +
 arch/mips/include/asm/cacheflush.h       |  1 +
 arch/mn10300/include/asm/cacheflush.h    |  1 +
 arch/parisc/include/asm/cacheflush.h     |  1 +
 arch/powerpc/include/asm/cacheflush.h    |  1 +
 arch/s390/include/asm/cacheflush.h       |  1 +
 arch/score/include/asm/cacheflush.h      |  1 +
 arch/sh/include/asm/cacheflush.h         |  1 +
 arch/sparc/include/asm/cacheflush_32.h   |  1 +
 arch/sparc/include/asm/cacheflush_64.h   |  1 +
 arch/x86/include/asm/cacheflush.h        |  1 +
 arch/xtensa/include/asm/cacheflush.h     |  1 +
 block/blk-core.c                         | 19 +++++++++++++++++++
 drivers/mtd/mtd_blkdevs.c                |  2 ++
 fs/bio.c                                 | 12 ++++++++++++
 include/asm-generic/cacheflush.h         |  1 +
 include/linux/bio.h                      | 12 ++++++++++++
 include/linux/blkdev.h                   | 11 +++++++++++
 29 files changed, 83 insertions(+)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h
index b686cc7fc44e..01d71e1c8a9e 100644
--- a/arch/alpha/include/asm/cacheflush.h
+++ b/arch/alpha/include/asm/cacheflush.h
@@ -9,6 +9,7 @@
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index fd03fb63a332..247b7b0adc2a 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -408,6 +408,7 @@ extern void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
  * about to change to user space.  This is the same method as used on SPARC64.
  * See update_mmu_cache for the user space part.
  */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *);
 
 extern void __flush_dcache_page(struct address_space *mapping, struct page *page);
diff --git a/arch/avr32/include/asm/cacheflush.h b/arch/avr32/include/asm/cacheflush.h
index 670674749b20..96e53820bbbd 100644
--- a/arch/avr32/include/asm/cacheflush.h
+++ b/arch/avr32/include/asm/cacheflush.h
@@ -107,6 +107,7 @@ extern void flush_icache_page(struct vm_area_struct *vma, struct page *page);
  * do something here, but only for certain configurations.  No such
  * configurations exist at this time.
  */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(page)		do { } while (0)
 #define flush_dcache_mmap_unlock(page)		do { } while (0)
diff --git a/arch/blackfin/include/asm/cacheflush.h b/arch/blackfin/include/asm/cacheflush.h
index af03a36c7a4e..417eaac7fe99 100644
--- a/arch/blackfin/include/asm/cacheflush.h
+++ b/arch/blackfin/include/asm/cacheflush.h
@@ -68,9 +68,11 @@ do { memcpy(dst, src, len);						\
 #endif
 #if defined(CONFIG_BFIN_EXTMEM_WRITEBACK) || defined(CONFIG_BFIN_L2_WRITEBACK)
 # define flush_dcache_range(start,end)		blackfin_dcache_flush_range((start), (end))
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 # define flush_dcache_page(page)		blackfin_dflush_page(page_address(page))
 #else
 # define flush_dcache_range(start,end)		do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 # define flush_dcache_page(page)		do { } while (0)
 #endif
 
diff --git a/arch/cris/include/asm/cacheflush.h b/arch/cris/include/asm/cacheflush.h
index cf60e3f69f8d..36795bca605e 100644
--- a/arch/cris/include/asm/cacheflush.h
+++ b/arch/cris/include/asm/cacheflush.h
@@ -12,6 +12,7 @@
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/arch/frv/include/asm/cacheflush.h b/arch/frv/include/asm/cacheflush.h
index 432a69e7f3d4..edbac54ae015 100644
--- a/arch/frv/include/asm/cacheflush.h
+++ b/arch/frv/include/asm/cacheflush.h
@@ -47,6 +47,7 @@ static inline void __flush_cache_all(void)
 }
 
 /* dcache/icache coherency... */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #ifdef CONFIG_MMU
 extern void flush_dcache_page(struct page *page);
 #else
diff --git a/arch/h8300/include/asm/cacheflush.h b/arch/h8300/include/asm/cacheflush.h
index 5ffdca217b95..4cf2df20c1ce 100644
--- a/arch/h8300/include/asm/cacheflush.h
+++ b/arch/h8300/include/asm/cacheflush.h
@@ -15,6 +15,7 @@
 #define	flush_cache_dup_mm(mm)		do { } while (0)
 #define	flush_cache_range(vma,a,b)
 #define	flush_cache_page(vma,p,pfn)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define	flush_dcache_page(page)
 #define	flush_dcache_mmap_lock(mapping)
 #define	flush_dcache_mmap_unlock(mapping)
diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h
index c8ce2719fee8..429eefc93ee7 100644
--- a/arch/ia64/include/asm/cacheflush.h
+++ b/arch/ia64/include/asm/cacheflush.h
@@ -25,6 +25,7 @@
 #define flush_cache_vmap(start, end)		do { } while (0)
 #define flush_cache_vunmap(start, end)		do { } while (0)
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)			\
 do {						\
 	clear_bit(PG_arch_1, &(page)->flags);	\
diff --git a/arch/m32r/include/asm/cacheflush.h b/arch/m32r/include/asm/cacheflush.h
index 78587c958146..8e8e04516c39 100644
--- a/arch/m32r/include/asm/cacheflush.h
+++ b/arch/m32r/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ extern void _flush_cache_copyback_all(void);
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
@@ -33,6 +34,7 @@ extern void smp_flush_cache_all(void);
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
@@ -46,6 +48,7 @@ extern void smp_flush_cache_all(void);
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h
index 16bf375fdbe1..73de7c89d8e0 100644
--- a/arch/m68k/include/asm/cacheflush_mm.h
+++ b/arch/m68k/include/asm/cacheflush_mm.h
@@ -128,6 +128,7 @@ static inline void __flush_page_to_ram(void *vaddr)
 	}
 }
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)		__flush_page_to_ram(page_address(page))
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/arch/m68k/include/asm/cacheflush_no.h b/arch/m68k/include/asm/cacheflush_no.h
index c65f00a94553..89f195656be7 100644
--- a/arch/m68k/include/asm/cacheflush_no.h
+++ b/arch/m68k/include/asm/cacheflush_no.h
@@ -12,6 +12,7 @@
 #define flush_cache_range(vma, start, end)	__flush_cache_all()
 #define flush_cache_page(vma, vmaddr)		do { } while (0)
 #define flush_dcache_range(start,len)		__flush_cache_all()
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/arch/microblaze/include/asm/cacheflush.h b/arch/microblaze/include/asm/cacheflush.h
index f989d6aad648..088076e657b3 100644
--- a/arch/microblaze/include/asm/cacheflush.h
+++ b/arch/microblaze/include/asm/cacheflush.h
@@ -37,6 +37,7 @@
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
 
 #define flush_dcache_range(start, end)	__invalidate_dcache_range(start, end)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h
index 03b1d69b142f..40bb9fde205f 100644
--- a/arch/mips/include/asm/cacheflush.h
+++ b/arch/mips/include/asm/cacheflush.h
@@ -38,6 +38,7 @@ extern void (*flush_cache_range)(struct vm_area_struct *vma,
 extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn);
 extern void __flush_dcache_page(struct page *page);
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 static inline void flush_dcache_page(struct page *page)
 {
 	if (cpu_has_dc_aliases || !cpu_has_ic_fills_f_dc)
diff --git a/arch/mn10300/include/asm/cacheflush.h b/arch/mn10300/include/asm/cacheflush.h
index 1a55d61f0d06..29e692f7f030 100644
--- a/arch/mn10300/include/asm/cacheflush.h
+++ b/arch/mn10300/include/asm/cacheflush.h
@@ -26,6 +26,7 @@
 #define flush_cache_page(vma, vmaddr, pfn)	do {} while (0)
 #define flush_cache_vmap(start, end)		do {} while (0)
 #define flush_cache_vunmap(start, end)		do {} while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do {} while (0)
 #define flush_dcache_mmap_lock(mapping)		do {} while (0)
 #define flush_dcache_mmap_unlock(mapping)	do {} while (0)
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index 724395143f26..7a73b615c23d 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -42,6 +42,7 @@ void flush_cache_mm(struct mm_struct *mm);
 #define flush_cache_vmap(start, end)		flush_cache_all()
 #define flush_cache_vunmap(start, end)		flush_cache_all()
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 
 #define flush_dcache_mmap_lock(mapping) \
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index ba667a383b8c..ab9e402518e8 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -25,6 +25,7 @@
 #define flush_cache_vmap(start, end)		do { } while (0)
 #define flush_cache_vunmap(start, end)		do { } while (0)
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/arch/s390/include/asm/cacheflush.h b/arch/s390/include/asm/cacheflush.h
index 49d5af916d01..405cc97c6249 100644
--- a/arch/s390/include/asm/cacheflush.h
+++ b/arch/s390/include/asm/cacheflush.h
@@ -10,6 +10,7 @@
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/arch/score/include/asm/cacheflush.h b/arch/score/include/asm/cacheflush.h
index 07cc8fc457cd..caaba24036e3 100644
--- a/arch/score/include/asm/cacheflush.h
+++ b/arch/score/include/asm/cacheflush.h
@@ -16,6 +16,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void flush_dcache_range(unsigned long start, unsigned long end);
 
 #define flush_cache_dup_mm(mm)			do {} while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do {} while (0)
 #define flush_dcache_mmap_lock(mapping)		do {} while (0)
 #define flush_dcache_mmap_unlock(mapping)	do {} while (0)
diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h
index c29918f3c819..dda96eb3e7c0 100644
--- a/arch/sh/include/asm/cacheflush.h
+++ b/arch/sh/include/asm/cacheflush.h
@@ -42,6 +42,7 @@ extern void flush_cache_page(struct vm_area_struct *vma,
 				unsigned long addr, unsigned long pfn);
 extern void flush_cache_range(struct vm_area_struct *vma,
 				 unsigned long start, unsigned long end);
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void flush_icache_page(struct vm_area_struct *vma,
diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h
index 68ac10910271..2e468773f250 100644
--- a/arch/sparc/include/asm/cacheflush_32.h
+++ b/arch/sparc/include/asm/cacheflush_32.h
@@ -75,6 +75,7 @@ BTFIXUPDEF_CALL(void, flush_sig_insns, struct mm_struct *, unsigned long)
 
 extern void sparc_flush_page_to_ram(struct page *page);
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)			sparc_flush_page_to_ram(page)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h
index c43321729b3b..b95384033e89 100644
--- a/arch/sparc/include/asm/cacheflush_64.h
+++ b/arch/sparc/include/asm/cacheflush_64.h
@@ -37,6 +37,7 @@ extern void flush_dcache_page_all(struct mm_struct *mm, struct page *page);
 #endif
 
 extern void __flush_dcache_range(unsigned long start, unsigned long end);
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 
 #define flush_icache_page(vma, pg)	do { } while(0)
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b54f6afe7ec4..9076add593a8 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
 				     unsigned long start, unsigned long end) { }
 static inline void flush_cache_page(struct vm_area_struct *vma,
 				    unsigned long vmaddr, unsigned long pfn) { }
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 static inline void flush_dcache_page(struct page *page) { }
 static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
 static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h
index b7b8fbe47c77..a508f2f73bd7 100644
--- a/arch/xtensa/include/asm/cacheflush.h
+++ b/arch/xtensa/include/asm/cacheflush.h
@@ -101,6 +101,7 @@ static inline void __invalidate_icache_page_alias(unsigned long virt,
 #define flush_cache_vmap(start,end)	flush_cache_all()
 #define flush_cache_vunmap(start,end)	flush_cache_all()
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page*);
 extern void flush_cache_range(struct vm_area_struct*, ulong, ulong);
 extern void flush_cache_page(struct vm_area_struct*, unsigned long, unsigned long);
diff --git a/block/blk-core.c b/block/blk-core.c
index 71da5111120c..718897e6d37f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2358,6 +2358,25 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+/**
+ * rq_flush_dcache_pages - Helper function to flush all pages in a request
+ * @rq: the request to be flushed
+ *
+ * Description:
+ *     Flush all pages in @rq.
+ */
+void rq_flush_dcache_pages(struct request *rq)
+{
+	struct req_iterator iter;
+	struct bio_vec *bvec;
+
+	rq_for_each_segment(bvec, rq, iter)
+		flush_dcache_page(bvec->bv_page);
+}
+EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
+#endif
+
 /**
  * blk_lld_busy - Check if underlying low-level drivers of a device are busy
  * @q : the queue of the device being checked
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 8ca17a3e96ea..64e2b379a350 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -59,12 +59,14 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->readsect(dev, block, buf))
 				return -EIO;
+		rq_flush_dcache_pages(req);
 		return 0;
 
 	case WRITE:
 		if (!tr->writesect)
 			return -EIO;
 
+		rq_flush_dcache_pages(req);
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->writesect(dev, block, buf))
 				return -EIO;
diff --git a/fs/bio.c b/fs/bio.c
index 12da5db8682c..e23a63f4f7de 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1393,6 +1393,18 @@ void bio_check_pages_dirty(struct bio *bio)
 	}
 }
 
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+void bio_flush_dcache_pages(struct bio *bi)
+{
+	int i;
+	struct bio_vec *bvec;
+
+	bio_for_each_segment(bvec, bi, i)
+		flush_dcache_page(bvec->bv_page);
+}
+EXPORT_SYMBOL(bio_flush_dcache_pages);
+#endif
+
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index ba4ec39a1131..57b5c3c82e86 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -13,6 +13,7 @@
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 474792b825d0..7fc5606e6ea5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -391,6 +391,18 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
 				 gfp_t, int);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
+
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+# error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
+#endif
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+extern void bio_flush_dcache_pages(struct bio *bi);
+#else
+static inline void bio_flush_dcache_pages(struct bio *bi)
+{
+}
+#endif
+
 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
 				 unsigned long, unsigned int, int, gfp_t);
 extern struct bio *bio_copy_user_iov(struct request_queue *,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1cc02972fbe2..e727f6c44c44 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -752,6 +752,17 @@ struct req_iterator {
 #define rq_iter_last(rq, _iter)					\
 		(_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1)
 
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+# error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
+#endif
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+extern void rq_flush_dcache_pages(struct request *rq);
+#else
+static inline void rq_flush_dcache_pages(struct request *rq)
+{
+}
+#endif
+
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern void register_disk(struct gendisk *dev);
-- 
cgit v1.2.3


From d9449ce35a1e8fb58dd2d419f9215562a14ecca0 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Thu, 26 Nov 2009 09:45:40 +0100
Subject: Fix regression in direct writes performance due to WRITE_ODIRECT flag
 removal

There seems to be a regression in direct write path due to following
commit in for-2.6.33 branch of block tree.

commit 1af60fbd759d31f565552fea315c2033947cfbe6
Author: Jeff Moyer <jmoyer@redhat.com>
Date:   Fri Oct 2 18:56:53 2009 -0400

    block: get rid of the WRITE_ODIRECT flag

Marking direct writes as WRITE_SYNC_PLUG instead of WRITE_ODIRECT, sets
the NOIDLE flag in bio and hence in request. This tells CFQ to not expect
more request from the queue and not idle on it (despite the fact that
queue's think time is less and it is not seeky).

So direct writers lose big time when competing with sequential readers.

Using fio, I have run one direct writer and two sequential readers and
following are the results with 2.6.32-rc7 kernel and with for-2.6.33
branch.

Test
====
1 direct writer and 2 sequential reader running simultaneously.

[global]
directory=/mnt/sdc/fio/
runtime=10

[seqwrite]
rw=write
size=4G
direct=1

[seqread]
rw=read
size=2G
numjobs=2

2.6.32-rc7
==========
direct writes: aggrb=2,968KB/s
readers	     : aggrb=101MB/s

for-2.6.33 branch
=================
direct write: aggrb=19KB/s
readers	      aggrb=137MB/s

This patch brings back the WRITE_ODIRECT flag, with the difference that we
don't set the BIO_RW_UNPLUG flag so that device is not unplugged after
submission of request and an explicit unplug from submitter is required.

That way we fix the jeff's issue of not enough merging taking place in aio
path as well as make sure direct writes get their fair share.

After the fix
=============
for-2.6.33 + fix
----------------
direct writes: aggrb=2,728KB/s
reads: aggrb=103MB/s

Thanks
Vivek

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/direct-io.c     | 2 +-
 include/linux/fs.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3af761c8c5cc..b912270942fa 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		rw = WRITE_SYNC_PLUG;
+		rw = WRITE_ODIRECT_PLUG;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2f5fca4147c2..79cea8051736 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -129,6 +129,7 @@ struct inodes_stat_t {
  * WRITE_SYNC		Like WRITE_SYNC_PLUG, but also unplugs the device
  *			immediately after submission. The write equivalent
  *			of READ_SYNC.
+ * WRITE_ODIRECT_PLUG	Special case write for O_DIRECT only.
  * SWRITE_SYNC
  * SWRITE_SYNC_PLUG	Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
  *			See SWRITE.
@@ -150,6 +151,7 @@ struct inodes_stat_t {
 #define READ_META	(READ | (1 << BIO_RW_META))
 #define WRITE_SYNC_PLUG	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
 #define WRITE_SYNC	(WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
+#define WRITE_ODIRECT_PLUG	(WRITE | (1 << BIO_RW_SYNCIO))
 #define SWRITE_SYNC_PLUG	\
 			(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
 #define SWRITE_SYNC	(SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
-- 
cgit v1.2.3


From d180c5bccec02612256fd8076ff3c1fac3429553 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Nov 2009 14:48:30 +0900
Subject: sched: Introduce task_times() to replace task_{u,s}time() pair

Functions task_{u,s}time() are called in pair in almost all
cases.  However task_stime() is implemented to call task_utime()
from its inside, so such paired calls run task_utime() twice.

It means we do heavy divisions (div_u64 + do_div) twice to get
utime and stime which can be obtained at same time by one set
of divisions.

This patch introduces a function task_times(*tsk, *utime,
*stime) to retrieve utime and stime at once in better, optimized
way.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
LKML-Reference: <4B0E16AE.906@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c       |  3 +--
 include/linux/sched.h |  1 +
 kernel/exit.c         |  7 +++++--
 kernel/sched.c        | 55 ++++++++++++++++++++++++++++++++-------------------
 kernel/sys.c          |  3 +--
 5 files changed, 43 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index e209f64ab27b..330deda70d08 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -535,8 +535,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	if (!whole) {
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		utime = task_utime(task);
-		stime = task_stime(task);
+		task_times(task, &utime, &stime);
 		gtime = task_gtime(task);
 	}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78ba664474f3..fe6ae1516640 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1723,6 +1723,7 @@ static inline void put_task_struct(struct task_struct *t)
 extern cputime_t task_utime(struct task_struct *p);
 extern cputime_t task_stime(struct task_struct *p);
 extern cputime_t task_gtime(struct task_struct *p);
+extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac2ecc1..29068ab2670a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,6 +91,8 @@ static void __exit_signal(struct task_struct *tsk)
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
 	else {
+		cputime_t utime, stime;
+
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
@@ -110,8 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, task_utime(tsk));
-		sig->stime = cputime_add(sig->stime, task_stime(tsk));
+		task_times(tsk, &utime, &stime);
+		sig->utime = cputime_add(sig->utime, utime);
+		sig->stime = cputime_add(sig->stime, stime);
 		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
diff --git a/kernel/sched.c b/kernel/sched.c
index 315ba4059f93..475a6f2b7158 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5191,6 +5191,14 @@ cputime_t task_stime(struct task_struct *p)
 {
 	return p->stime;
 }
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	if (ut)
+		*ut = task_utime(p);
+	if (st)
+		*st = task_stime(p);
+}
 #else
 
 #ifndef nsecs_to_cputime
@@ -5198,41 +5206,48 @@ cputime_t task_stime(struct task_struct *p)
 	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
 #endif
 
-cputime_t task_utime(struct task_struct *p)
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	cputime_t utime = p->utime, total = utime + p->stime;
-	u64 temp;
+	cputime_t rtime, utime = p->utime, total = utime + p->stime;
 
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
+	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
 
 	if (total) {
-		temp *= utime;
+		u64 temp;
+
+		temp = (u64)(rtime * utime);
 		do_div(temp, total);
-	}
-	utime = (cputime_t)temp;
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
 
+	/*
+	 * Compare with previous values, to keep monotonicity:
+	 */
 	p->prev_utime = max(p->prev_utime, utime);
-	return p->prev_utime;
+	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+
+	if (ut)
+		*ut = p->prev_utime;
+	if (st)
+		*st = p->prev_stime;
+}
+
+cputime_t task_utime(struct task_struct *p)
+{
+	cputime_t utime;
+	task_times(p, &utime, NULL);
+	return utime;
 }
 
 cputime_t task_stime(struct task_struct *p)
 {
 	cputime_t stime;
-
-	/*
-	 * Use CFS's precise accounting. (we subtract utime from
-	 * the total, to make sure the total observed by userspace
-	 * grows monotonically - apps rely on that):
-	 */
-	stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
-
-	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, stime);
-
-	return p->prev_stime;
+	task_times(p, NULL, &stime);
+	return stime;
 }
 #endif
 
diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760d9c51..bbdfce0d4347 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1346,8 +1346,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	utime = stime = cputime_zero;
 
 	if (who == RUSAGE_THREAD) {
-		utime = task_utime(current);
-		stime = task_stime(current);
+		task_times(current, &utime, &stime);
 		accumulate_thread_rusage(p, r);
 		maxrss = p->signal->maxrss;
 		goto out;
-- 
cgit v1.2.3


From d5b7c78e975302a1bab28263266c39ecb71acad4 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Nov 2009 14:49:05 +0900
Subject: sched: Remove task_{u,s,g}time()

Now all task_{u,s}time() pairs are replaced by task_times().
And task_gtime() is too simple to be an inline function.

Cleanup them all.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
LKML-Reference: <4B0E16D1.70902@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c       |  4 ++--
 include/linux/sched.h |  3 ---
 kernel/exit.c         |  2 +-
 kernel/sched.c        | 33 ++-------------------------------
 4 files changed, 5 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 330deda70d08..ca61a88aed66 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				gtime = cputime_add(gtime, task_gtime(t));
+				gtime = cputime_add(gtime, t->gtime);
 				t = next_thread(t);
 			} while (t != task);
 
@@ -536,7 +536,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
 		task_times(task, &utime, &stime);
-		gtime = task_gtime(task);
+		gtime = task->gtime;
 	}
 
 	/* scale priority and nice values from timeslices to -20..20 */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe6ae1516640..0395b0f4df3a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1720,9 +1720,6 @@ static inline void put_task_struct(struct task_struct *t)
 		__put_task_struct(t);
 }
 
-extern cputime_t task_utime(struct task_struct *p);
-extern cputime_t task_stime(struct task_struct *p);
-extern cputime_t task_gtime(struct task_struct *p);
 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
diff --git a/kernel/exit.c b/kernel/exit.c
index 29068ab2670a..2eaf68b634e3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,7 +115,7 @@ static void __exit_signal(struct task_struct *tsk)
 		task_times(tsk, &utime, &stime);
 		sig->utime = cputime_add(sig->utime, utime);
 		sig->stime = cputime_add(sig->stime, stime);
-		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
+		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
diff --git a/kernel/sched.c b/kernel/sched.c
index 475a6f2b7158..82251c21f785 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5182,22 +5182,12 @@ void account_idle_ticks(unsigned long ticks)
  * Use precise platform statistics if available:
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-cputime_t task_utime(struct task_struct *p)
-{
-	return p->utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
-	return p->stime;
-}
-
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	if (ut)
-		*ut = task_utime(p);
+		*ut = p->utime;
 	if (st)
-		*st = task_stime(p);
+		*st = p->stime;
 }
 #else
 
@@ -5235,27 +5225,8 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	if (st)
 		*st = p->prev_stime;
 }
-
-cputime_t task_utime(struct task_struct *p)
-{
-	cputime_t utime;
-	task_times(p, &utime, NULL);
-	return utime;
-}
-
-cputime_t task_stime(struct task_struct *p)
-{
-	cputime_t stime;
-	task_times(p, NULL, &stime);
-	return stime;
-}
 #endif
 
-inline cputime_t task_gtime(struct task_struct *p)
-{
-	return p->gtime;
-}
-
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
-- 
cgit v1.2.3


From b7b20df91d43d5e59578b8fc16e895c0c8cbd9b5 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Thu, 26 Nov 2009 14:49:27 +0900
Subject: sched, time: Define nsecs_to_jiffies()

Use of msecs_to_jiffies() for nsecs_to_cputime() have some
problems:

 - The type of msecs_to_jiffies()'s argument is unsigned int, so
   it cannot convert msecs greater than UINT_MAX = about 49.7 days.

 - msecs_to_jiffies() returns MAX_JIFFY_OFFSET if MSB of argument
   is set, assuming that input was negative value.  So it cannot
   convert msecs greater than INT_MAX = about 24.8 days too.

This patch defines a new function nsecs_to_jiffies() that can
deal greater values, and that can deal all incoming values as
unsigned.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Amrico Wang <xiyou.wangcong@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <johnstul@linux.vnet.ibm.com>
LKML-Reference: <4B0E16E7.5070307@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/jiffies.h |  1 +
 kernel/sched.c          |  3 +--
 kernel/time.c           | 30 ++++++++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 1a9cf78bfce5..6811f4bfc6e7 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x);
 extern unsigned long clock_t_to_jiffies(unsigned long x);
 extern u64 jiffies_64_to_clock_t(u64 x);
 extern u64 nsec_to_clock_t(u64 x);
+extern unsigned long nsecs_to_jiffies(u64 n);
 
 #define TIMESTAMP_SIZE	30
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 82251c21f785..b3d4e2be95aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5192,8 +5192,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 #else
 
 #ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) \
-	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
 #endif
 
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fec..804798005d19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -662,6 +662,36 @@ u64 nsec_to_clock_t(u64 x)
 #endif
 }
 
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n:	nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+#if (NSEC_PER_SEC % HZ) == 0
+	/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
+	return div_u64(n, NSEC_PER_SEC / HZ);
+#elif (HZ % 512) == 0
+	/* overflow after 292 years if HZ = 1024 */
+	return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
+#else
+	/*
+	 * Generic case - optimized for cases where HZ is a multiple of 3.
+	 * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
+	 */
+	return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
+#endif
+}
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
-- 
cgit v1.2.3


From 445409602c09219767c06497c0dc2285eac244ed Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 26 Nov 2009 06:07:08 +0000
Subject: veth: move loopback logic to common location

The veth driver contains code to forward an skb
from the start_xmit function of one network
device into the receive path of another device.

Moving that code into a common location lets us
reuse the code for direct forwarding of data
between macvlan ports, and possibly in other
drivers.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/veth.c        | 17 +++--------------
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 2d657f2314cb..6c4b5a2d7877 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -155,8 +155,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct veth_net_stats *stats, *rcv_stats;
 	int length, cpu;
 
-	skb_orphan(skb);
-
 	priv = netdev_priv(dev);
 	rcv = priv->peer;
 	rcv_priv = netdev_priv(rcv);
@@ -168,20 +166,12 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (!(rcv->flags & IFF_UP))
 		goto tx_drop;
 
-	if (skb->len > (rcv->mtu + MTU_PAD))
-		goto rx_drop;
-
-        skb->tstamp.tv64 = 0;
-	skb->pkt_type = PACKET_HOST;
-	skb->protocol = eth_type_trans(skb, rcv);
 	if (dev->features & NETIF_F_NO_CSUM)
 		skb->ip_summed = rcv_priv->ip_summed;
 
-	skb->mark = 0;
-	secpath_reset(skb);
-	nf_reset(skb);
-
-	length = skb->len;
+	length = skb->len + ETH_HLEN;
+	if (dev_forward_skb(rcv, skb) != NET_RX_SUCCESS)
+		goto rx_drop;
 
 	stats->tx_bytes += length;
 	stats->tx_packets++;
@@ -189,7 +179,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 	rcv_stats->rx_bytes += length;
 	rcv_stats->rx_packets++;
 
-	netif_rx(skb);
 	return NETDEV_TX_OK;
 
 tx_drop:
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97873e31661c..9428793775a0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1562,6 +1562,8 @@ extern int		dev_set_mac_address(struct net_device *,
 extern int		dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev,
 					    struct netdev_queue *txq);
+extern int		dev_forward_skb(struct net_device *dev,
+					struct sk_buff *skb);
 
 extern int		netdev_budget;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index e65af6041415..7775e8b48deb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -105,6 +105,7 @@
 #include <net/dst.h>
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
+#include <net/xfrm.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/kmod.h>
@@ -1419,6 +1420,45 @@ static inline void net_timestamp(struct sk_buff *skb)
 		skb->tstamp.tv64 = 0;
 }
 
+/**
+ * dev_forward_skb - loopback an skb to another netif
+ *
+ * @dev: destination network device
+ * @skb: buffer to forward
+ *
+ * return values:
+ *	NET_RX_SUCCESS	(no congestion)
+ *	NET_RX_DROP     (packet was dropped)
+ *
+ * dev_forward_skb can be used for injecting an skb from the
+ * start_xmit function of one device into the receive queue
+ * of another device.
+ *
+ * The receiving device may be in another namespace, so
+ * we have to clear all information in the skb that could
+ * impact namespace isolation.
+ */
+int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+	skb_orphan(skb);
+
+	if (!(dev->flags & IFF_UP))
+		return NET_RX_DROP;
+
+	if (skb->len > (dev->mtu + dev->hard_header_len))
+		return NET_RX_DROP;
+
+	skb_dst_drop(skb);
+	skb->tstamp.tv64 = 0;
+	skb->pkt_type = PACKET_HOST;
+	skb->protocol = eth_type_trans(skb, dev);
+	skb->mark = 0;
+	secpath_reset(skb);
+	nf_reset(skb);
+	return netif_rx(skb);
+}
+EXPORT_SYMBOL_GPL(dev_forward_skb);
+
 /*
  *	Support routine. Sends outgoing frames to any network
  *	taps currently in use.
-- 
cgit v1.2.3


From 27c0b1a850cdea6298f573d835782f3337be913c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 26 Nov 2009 06:07:11 +0000
Subject: macvlan: export macvlan mode through netlink

In order to support all three modes of macvlan at
runtime, extend the existing netlink protocol
to allow choosing the mode per macvlan slave
interface.

This depends on a matching patch to iproute2
in order to become accessible in user land.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c   | 56 +++++++++++++++++++++++++++++++++++++++++++------
 include/linux/if_link.h | 15 +++++++++++++
 2 files changed, 65 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index d6bd84353f44..322112c7358a 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -33,12 +33,6 @@
 
 #define MACVLAN_HASH_SIZE	(1 << BITS_PER_BYTE)
 
-enum macvlan_mode {
-	MACVLAN_MODE_PRIVATE	= 1,
-	MACVLAN_MODE_VEPA	= 2,
-	MACVLAN_MODE_BRIDGE	= 4,
-};
-
 struct macvlan_port {
 	struct net_device	*dev;
 	struct hlist_head	vlan_hash[MACVLAN_HASH_SIZE];
@@ -614,6 +608,17 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[])
 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
 			return -EADDRNOTAVAIL;
 	}
+
+	if (data && data[IFLA_MACVLAN_MODE]) {
+		switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) {
+		case MACVLAN_MODE_PRIVATE:
+		case MACVLAN_MODE_VEPA:
+		case MACVLAN_MODE_BRIDGE:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
 	return 0;
 }
 
@@ -678,6 +683,10 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
 	vlan->dev      = dev;
 	vlan->port     = port;
 
+	vlan->mode     = MACVLAN_MODE_VEPA;
+	if (data && data[IFLA_MACVLAN_MODE])
+		vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);
+
 	err = register_netdevice(dev);
 	if (err < 0)
 		return err;
@@ -699,6 +708,36 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head)
 		macvlan_port_destroy(port->dev);
 }
 
+static int macvlan_changelink(struct net_device *dev,
+		struct nlattr *tb[], struct nlattr *data[])
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	if (data && data[IFLA_MACVLAN_MODE])
+		vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);
+	return 0;
+}
+
+static size_t macvlan_get_size(const struct net_device *dev)
+{
+	return nla_total_size(4);
+}
+
+static int macvlan_fill_info(struct sk_buff *skb,
+				const struct net_device *dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	NLA_PUT_U32(skb, IFLA_MACVLAN_MODE, vlan->mode);
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
+	[IFLA_MACVLAN_MODE] = { .type = NLA_U32 },
+};
+
 static struct rtnl_link_ops macvlan_link_ops __read_mostly = {
 	.kind		= "macvlan",
 	.priv_size	= sizeof(struct macvlan_dev),
@@ -707,6 +746,11 @@ static struct rtnl_link_ops macvlan_link_ops __read_mostly = {
 	.validate	= macvlan_validate,
 	.newlink	= macvlan_newlink,
 	.dellink	= macvlan_dellink,
+	.maxtype	= IFLA_MACVLAN_MAX,
+	.policy		= macvlan_policy,
+	.changelink	= macvlan_changelink,
+	.get_size	= macvlan_get_size,
+	.fill_info	= macvlan_fill_info,
 };
 
 static int macvlan_device_event(struct notifier_block *unused,
diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 1d3b2425f661..6674791622ca 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -181,4 +181,19 @@ struct ifla_vlan_qos_mapping {
 	__u32 to;
 };
 
+/* MACVLAN section */
+enum {
+	IFLA_MACVLAN_UNSPEC,
+	IFLA_MACVLAN_MODE,
+	__IFLA_MACVLAN_MAX,
+};
+
+#define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1)
+
+enum macvlan_mode {
+	MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */
+	MACVLAN_MODE_VEPA    = 2, /* talk to other ports through ext bridge */
+	MACVLAN_MODE_BRIDGE  = 4, /* talk to bridge ports directly */
+};
+
 #endif /* _LINUX_IF_LINK_H */
-- 
cgit v1.2.3


From 5e7565930524410f097f5b04f8aba663089a6ffc Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 25 Nov 2009 07:54:54 +0000
Subject: vlan: support "loose binding" to the underlying network device

Currently the UP/DOWN state of VLANs is synchronized to the state of the
underlying device, meaning all VLANs are set down once the underlying
device is set down. This causes all routes to the VLAN devices to vanish.

Add a flag to specify a "loose binding" mode, in which only the operstate
is transfered, but the VLAN device state is independant.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h  | 1 +
 net/8021q/vlan.c         | 9 +++++++--
 net/8021q/vlan_dev.c     | 6 ++++--
 net/8021q/vlan_netlink.c | 3 ++-
 4 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 153f6b9e722c..3d870fda8c4f 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -339,6 +339,7 @@ enum vlan_ioctl_cmds {
 enum vlan_flags {
 	VLAN_FLAG_REORDER_HDR	= 0x1,
 	VLAN_FLAG_GVRP		= 0x2,
+	VLAN_FLAG_LOOSE_BINDING	= 0x4,
 };
 
 enum vlan_name_types {
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 1483243edf14..225aa2fac0e3 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -431,6 +431,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 	struct vlan_group *grp;
 	int i, flgs;
 	struct net_device *vlandev;
+	struct vlan_dev_info *vlan;
 	LIST_HEAD(list);
 
 	if (is_vlan_dev(dev))
@@ -507,7 +508,9 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 			if (!(flgs & IFF_UP))
 				continue;
 
-			dev_change_flags(vlandev, flgs & ~IFF_UP);
+			vlan = vlan_dev_info(vlandev);
+			if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+				dev_change_flags(vlandev, flgs & ~IFF_UP);
 			vlan_transfer_operstate(dev, vlandev);
 		}
 		break;
@@ -523,7 +526,9 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 			if (flgs & IFF_UP)
 				continue;
 
-			dev_change_flags(vlandev, flgs | IFF_UP);
+			vlan = vlan_dev_info(vlandev);
+			if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+				dev_change_flags(vlandev, flgs | IFF_UP);
 			vlan_transfer_operstate(dev, vlandev);
 		}
 		break;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index de0dc6bacbe8..b7889782047e 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -431,7 +431,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
 	struct vlan_dev_info *vlan = vlan_dev_info(dev);
 	u32 old_flags = vlan->flags;
 
-	if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP))
+	if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
+		     VLAN_FLAG_LOOSE_BINDING))
 		return -EINVAL;
 
 	vlan->flags = (old_flags & ~mask) | (flags & mask);
@@ -456,7 +457,8 @@ static int vlan_dev_open(struct net_device *dev)
 	struct net_device *real_dev = vlan->real_dev;
 	int err;
 
-	if (!(real_dev->flags & IFF_UP))
+	if (!(real_dev->flags & IFF_UP) &&
+	    !(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
 		return -ENETDOWN;
 
 	if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) {
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 3c9cf6a8e7fb..ddc105734af7 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -60,7 +60,8 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[])
 	if (data[IFLA_VLAN_FLAGS]) {
 		flags = nla_data(data[IFLA_VLAN_FLAGS]);
 		if ((flags->flags & flags->mask) &
-		    ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP))
+		    ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
+		      VLAN_FLAG_LOOSE_BINDING))
 			return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 5fa10b28e57f94a90535cfeafe89dcee9f47d540 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 27 Nov 2009 04:55:53 +0100
Subject: hw-breakpoints: Use struct perf_event_attr to define user breakpoints

In-kernel user breakpoints are created using functions in which
we pass breakpoint parameters as individual variables: address,
length and type.

Although it fits well for x86, this just does not scale across
archictectures that may support this api later as these may have
more or different needs. Pass in a perf_event_attr structure
instead because it is meant to evolve as much as possible into
a generic hardware breakpoint parameter structure.

Reported-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1259294154-5197-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ptrace.c      | 74 ++++++++++++++++++++----------------
 include/linux/hw_breakpoint.h | 36 ++++++++----------
 kernel/hw_breakpoint.c        | 87 +++++++++----------------------------------
 3 files changed, 75 insertions(+), 122 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 75e0cd847bd6..2941b32ea666 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -593,6 +593,34 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[])
 	return dr7;
 }
 
+static struct perf_event *
+ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+			 struct task_struct *tsk)
+{
+	int err;
+	int gen_len, gen_type;
+	DEFINE_BREAKPOINT_ATTR(attr);
+
+	/*
+	 * We shoud have at least an inactive breakpoint at this
+	 * slot. It means the user is writing dr7 without having
+	 * written the address register first
+	 */
+	if (!bp)
+		return ERR_PTR(-EINVAL);
+
+	err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+	if (err)
+		return ERR_PTR(err);
+
+	attr = bp->attr;
+	attr.bp_len = gen_len;
+	attr.bp_type = gen_type;
+	attr.disabled = 0;
+
+	return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
+}
+
 /*
  * Handle ptrace writes to debug register 7.
  */
@@ -603,7 +631,6 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 	int i, orig_ret = 0, rc = 0;
 	int enabled, second_pass = 0;
 	unsigned len, type;
-	int gen_len, gen_type;
 	struct perf_event *bp;
 
 	data &= ~DR_CONTROL_RESERVED;
@@ -634,33 +661,12 @@ restore:
 			continue;
 		}
 
-		/*
-		 * We shoud have at least an inactive breakpoint at this
-		 * slot. It means the user is writing dr7 without having
-		 * written the address register first
-		 */
-		if (!bp) {
-			rc = -EINVAL;
-			break;
-		}
-
-		rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
-		if (rc)
-			break;
-
-		/*
-		 * This is a temporary thing as bp is unregistered/registered
-		 * to simulate modification
-		 */
-		bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len,
-					       gen_type, bp->callback,
-					       tsk, true);
-		thread->ptrace_bps[i] = NULL;
+		bp = ptrace_modify_breakpoint(bp, len, type, tsk);
 
 		/* Incorrect bp, or we have a bug in bp API */
 		if (IS_ERR(bp)) {
 			rc = PTR_ERR(bp);
-			bp = NULL;
+			thread->ptrace_bps[i] = NULL;
 			break;
 		}
 		thread->ptrace_bps[i] = bp;
@@ -707,24 +713,26 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 {
 	struct perf_event *bp;
 	struct thread_struct *t = &tsk->thread;
+	DEFINE_BREAKPOINT_ATTR(attr);
 
 	if (!t->ptrace_bps[nr]) {
 		/*
 		 * Put stub len and type to register (reserve) an inactive but
 		 * correct bp
 		 */
-		bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1,
-						 HW_BREAKPOINT_W,
-						 ptrace_triggered, tsk,
-						 false);
+		attr.bp_addr = addr;
+		attr.bp_len = HW_BREAKPOINT_LEN_1;
+		attr.bp_type = HW_BREAKPOINT_W;
+		attr.disabled = 1;
+
+		bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
 	} else {
 		bp = t->ptrace_bps[nr];
 		t->ptrace_bps[nr] = NULL;
-		bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len,
-					       bp->attr.bp_type,
-					       bp->callback,
-					       tsk,
-					       bp->attr.disabled);
+
+		attr = bp->attr;
+		attr.bp_addr = addr;
+		bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
 	}
 	/*
 	 * CHECKME: the previous code returned -EIO if the addr wasn't a
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index c9f7f7c7b0e0..5da472e434b7 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -20,6 +20,14 @@ enum {
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
+/* As it's for in-kernel or ptrace use, we want it to be pinned */
+#define DEFINE_BREAKPOINT_ATTR(name)	\
+struct perf_event_attr name = {		\
+	.type = PERF_TYPE_BREAKPOINT,	\
+	.size = sizeof(name),		\
+	.pinned = 1,			\
+};
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
@@ -36,22 +44,16 @@ static inline int hw_breakpoint_len(struct perf_event *bp)
 }
 
 extern struct perf_event *
-register_user_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
+register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_callback_t triggered,
-			    struct task_struct *tsk,
-			    bool active);
+			    struct task_struct *tsk);
 
 /* FIXME: only change from the attr, and don't unregister */
 extern struct perf_event *
 modify_user_hw_breakpoint(struct perf_event *bp,
-			  unsigned long addr,
-			  int len,
-			  int type,
+			  struct perf_event_attr *attr,
 			  perf_callback_t triggered,
-			  struct task_struct *tsk,
-			  bool active);
+			  struct task_struct *tsk);
 
 /*
  * Kernel breakpoints are not associated with any particular thread.
@@ -89,20 +91,14 @@ static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
 #else /* !CONFIG_HAVE_HW_BREAKPOINT */
 
 static inline struct perf_event *
-register_user_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
+register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_callback_t triggered,
-			    struct task_struct *tsk,
-			    bool active)		{ return NULL; }
+			    struct task_struct *tsk)	{ return NULL; }
 static inline struct perf_event *
 modify_user_hw_breakpoint(struct perf_event *bp,
-			  unsigned long addr,
-			  int len,
-			  int type,
+			  struct perf_event_attr *attr,
 			  perf_callback_t triggered,
-			  struct task_struct *tsk,
-			  bool active)			{ return NULL; }
+			  struct task_struct *tsk)	{ return NULL; }
 static inline struct perf_event *
 register_wide_hw_breakpoint_cpu(unsigned long addr,
 				int len,
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 32e1018191be..2a47514f12fd 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -289,90 +289,32 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
 	return __register_perf_hw_breakpoint(bp);
 }
 
-/*
- * Register a breakpoint bound to a task and a given cpu.
- * If cpu is -1, the breakpoint is active for the task in every cpu
- * If the task is -1, the breakpoint is active for every tasks in the given
- * cpu.
- */
-static struct perf_event *
-register_user_hw_breakpoint_cpu(unsigned long addr,
-				int len,
-				int type,
-				perf_callback_t triggered,
-				pid_t pid,
-				int cpu,
-				bool active)
-{
-	struct perf_event_attr *attr;
-	struct perf_event *bp;
-
-	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-	if (!attr)
-		return ERR_PTR(-ENOMEM);
-
-	attr->type = PERF_TYPE_BREAKPOINT;
-	attr->size = sizeof(*attr);
-	attr->bp_addr = addr;
-	attr->bp_len = len;
-	attr->bp_type = type;
-	/*
-	 * Such breakpoints are used by debuggers to trigger signals when
-	 * we hit the excepted memory op. We can't miss such events, they
-	 * must be pinned.
-	 */
-	attr->pinned = 1;
-
-	if (!active)
-		attr->disabled = 1;
-
-	bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered);
-	kfree(attr);
-
-	return bp;
-}
-
 /**
  * register_user_hw_breakpoint - register a hardware breakpoint for user space
- * @addr: is the memory address that triggers the breakpoint
- * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
- * @type: the type of the access to the memory (read/write/exec)
+ * @attr: breakpoint attributes
  * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @active: should we activate it while registering it
- *
  */
 struct perf_event *
-register_user_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
+register_user_hw_breakpoint(struct perf_event_attr *attr,
 			    perf_callback_t triggered,
-			    struct task_struct *tsk,
-			    bool active)
+			    struct task_struct *tsk)
 {
-	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
-					       tsk->pid, -1, active);
+	return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
 }
 EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 
 /**
  * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
  * @bp: the breakpoint structure to modify
- * @addr: is the memory address that triggers the breakpoint
- * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
- * @type: the type of the access to the memory (read/write/exec)
+ * @attr: new breakpoint attributes
  * @triggered: callback to trigger when we hit the breakpoint
  * @tsk: pointer to 'task_struct' of the process to which the address belongs
- * @active: should we activate it while registering it
  */
 struct perf_event *
-modify_user_hw_breakpoint(struct perf_event *bp,
-			  unsigned long addr,
-			  int len,
-			  int type,
+modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
 			  perf_callback_t triggered,
-			  struct task_struct *tsk,
-			  bool active)
+			  struct task_struct *tsk)
 {
 	/*
 	 * FIXME: do it without unregistering
@@ -381,8 +323,7 @@ modify_user_hw_breakpoint(struct perf_event *bp,
 	 */
 	unregister_hw_breakpoint(bp);
 
-	return register_user_hw_breakpoint(addr, len, type, triggered,
-					   tsk, active);
+	return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
 
@@ -406,8 +347,16 @@ register_kernel_hw_breakpoint_cpu(unsigned long addr,
 				  int cpu,
 				  bool active)
 {
-	return register_user_hw_breakpoint_cpu(addr, len, type, triggered,
-					       -1, cpu, active);
+	DEFINE_BREAKPOINT_ATTR(attr);
+
+	attr.bp_addr = addr;
+	attr.bp_len = len;
+	attr.bp_type = type;
+
+	if (!active)
+		attr.disabled = 1;
+
+	return perf_event_create_kernel_counter(&attr, cpu, -1, triggered);
 }
 
 /**
-- 
cgit v1.2.3


From dd1853c3f493f6d22d9e5390b192a07b73d2ac0a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 27 Nov 2009 04:55:54 +0100
Subject: hw-breakpoints: Use struct perf_event_attr to define kernel
 breakpoints

Kernel breakpoints are created using functions in which we pass
breakpoint parameters as individual variables: address, length
and type.

Although it fits well for x86, this just does not scale across
architectures that may support this api later as these may have
more or different needs. Pass in a perf_event_attr structure
instead because it is meant to evolve as much as possible into
a generic hardware breakpoint parameter structure.

Reported-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1259294154-5197-2-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hw_breakpoint.h           | 35 ++++++++++++---------------
 kernel/hw_breakpoint.c                  | 35 ++++-----------------------
 kernel/trace/trace_ksym.c               | 42 ++++++++++++++++-----------------
 samples/hw_breakpoint/data_breakpoint.c | 10 ++++----
 4 files changed, 44 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 5da472e434b7..a03daed08c59 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -28,6 +28,13 @@ struct perf_event_attr name = {		\
 	.pinned = 1,			\
 };
 
+static inline void hw_breakpoint_init(struct perf_event_attr *attr)
+{
+	attr->type = PERF_TYPE_BREAKPOINT;
+	attr->size = sizeof(*attr);
+	attr->pinned = 1;
+}
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
@@ -59,19 +66,13 @@ modify_user_hw_breakpoint(struct perf_event *bp,
  * Kernel breakpoints are not associated with any particular thread.
  */
 extern struct perf_event *
-register_wide_hw_breakpoint_cpu(unsigned long addr,
-				int len,
-				int type,
+register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr,
 				perf_callback_t triggered,
-				int cpu,
-				bool active);
+				int cpu);
 
 extern struct perf_event **
-register_wide_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
-			    perf_callback_t triggered,
-			    bool active);
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+			    perf_callback_t triggered);
 
 extern int register_perf_hw_breakpoint(struct perf_event *bp);
 extern int __register_perf_hw_breakpoint(struct perf_event *bp);
@@ -100,18 +101,12 @@ modify_user_hw_breakpoint(struct perf_event *bp,
 			  perf_callback_t triggered,
 			  struct task_struct *tsk)	{ return NULL; }
 static inline struct perf_event *
-register_wide_hw_breakpoint_cpu(unsigned long addr,
-				int len,
-				int type,
+register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr,
 				perf_callback_t triggered,
-				int cpu,
-				bool active)		{ return NULL; }
+				int cpu)		{ return NULL; }
 static inline struct perf_event **
-register_wide_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
-			    perf_callback_t triggered,
-			    bool active)		{ return NULL; }
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+			    perf_callback_t triggered)	{ return NULL; }
 static inline int
 register_perf_hw_breakpoint(struct perf_event *bp)	{ return -ENOSYS; }
 static inline int
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 2a47514f12fd..cf5ee1628411 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -339,42 +339,16 @@ void unregister_hw_breakpoint(struct perf_event *bp)
 }
 EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
 
-static struct perf_event *
-register_kernel_hw_breakpoint_cpu(unsigned long addr,
-				  int len,
-				  int type,
-				  perf_callback_t triggered,
-				  int cpu,
-				  bool active)
-{
-	DEFINE_BREAKPOINT_ATTR(attr);
-
-	attr.bp_addr = addr;
-	attr.bp_len = len;
-	attr.bp_type = type;
-
-	if (!active)
-		attr.disabled = 1;
-
-	return perf_event_create_kernel_counter(&attr, cpu, -1, triggered);
-}
-
 /**
  * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
- * @addr: is the memory address that triggers the breakpoint
- * @len: the length of the access to the memory (1 byte, 2 bytes etc...)
- * @type: the type of the access to the memory (read/write/exec)
+ * @attr: breakpoint attributes
  * @triggered: callback to trigger when we hit the breakpoint
- * @active: should we activate it while registering it
  *
  * @return a set of per_cpu pointers to perf events
  */
 struct perf_event **
-register_wide_hw_breakpoint(unsigned long addr,
-			    int len,
-			    int type,
-			    perf_callback_t triggered,
-			    bool active)
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+			    perf_callback_t triggered)
 {
 	struct perf_event **cpu_events, **pevent, *bp;
 	long err;
@@ -386,8 +360,7 @@ register_wide_hw_breakpoint(unsigned long addr,
 
 	for_each_possible_cpu(cpu) {
 		pevent = per_cpu_ptr(cpu_events, cpu);
-		bp = register_kernel_hw_breakpoint_cpu(addr, len, type,
-					triggered, cpu, active);
+		bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
 
 		*pevent = bp;
 
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index c538b15b95d6..ddfa0fd43bc0 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -42,9 +42,7 @@
 
 struct trace_ksym {
 	struct perf_event	**ksym_hbp;
-	unsigned long		ksym_addr;
-	int			type;
-	int			len;
+	struct perf_event_attr	attr;
 #ifdef CONFIG_PROFILE_KSYM_TRACER
 	unsigned long		counter;
 #endif
@@ -71,7 +69,7 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
-		if ((entry->ksym_addr == hbp_hit_addr) &&
+		if ((entry->attr.bp_addr == hbp_hit_addr) &&
 		    (entry->counter <= MAX_UL_INT)) {
 			entry->counter++;
 			break;
@@ -192,14 +190,15 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 	if (!entry)
 		return -ENOMEM;
 
-	entry->type = op;
-	entry->ksym_addr = addr;
-	entry->len = HW_BREAKPOINT_LEN_4;
+	hw_breakpoint_init(&entry->attr);
+
+	entry->attr.bp_type = op;
+	entry->attr.bp_addr = addr;
+	entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
 
 	ret = -EAGAIN;
-	entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr,
-					entry->len, entry->type,
-					ksym_hbp_handler, true);
+	entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
+					ksym_hbp_handler);
 
 	if (IS_ERR(entry->ksym_hbp)) {
 		ret = PTR_ERR(entry->ksym_hbp);
@@ -236,12 +235,12 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
 	mutex_lock(&ksym_tracer_mutex);
 
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
-		ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr);
-		if (entry->type == HW_BREAKPOINT_R)
+		ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
+		if (entry->attr.bp_type == HW_BREAKPOINT_R)
 			ret = trace_seq_puts(s, "r--\n");
-		else if (entry->type == HW_BREAKPOINT_W)
+		else if (entry->attr.bp_type == HW_BREAKPOINT_W)
 			ret = trace_seq_puts(s, "-w-\n");
-		else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
+		else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
 			ret = trace_seq_puts(s, "rw-\n");
 		WARN_ON_ONCE(!ret);
 	}
@@ -317,9 +316,9 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 
 	ret = -EINVAL;
 	hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
-		if (entry->ksym_addr == ksym_addr) {
+		if (entry->attr.bp_addr == ksym_addr) {
 			/* Check for malformed request: (6) */
-			if (entry->type != op)
+			if (entry->attr.bp_type != op)
 				changed = 1;
 			else
 				goto out;
@@ -328,13 +327,12 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 	}
 	if (changed) {
 		unregister_wide_hw_breakpoint(entry->ksym_hbp);
-		entry->type = op;
+		entry->attr.bp_type = op;
 		ret = 0;
 		if (op > 0) {
 			entry->ksym_hbp =
-				register_wide_hw_breakpoint(entry->ksym_addr,
-					entry->len, entry->type,
-					ksym_hbp_handler, true);
+				register_wide_hw_breakpoint(&entry->attr,
+					ksym_hbp_handler);
 			if (IS_ERR(entry->ksym_hbp))
 				ret = PTR_ERR(entry->ksym_hbp);
 			else
@@ -489,7 +487,7 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v)
 
 	entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
 
-	access_type = entry->type;
+	access_type = entry->attr.bp_type;
 
 	switch (access_type) {
 	case HW_BREAKPOINT_R:
@@ -505,7 +503,7 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v)
 		seq_puts(m, "  NA          ");
 	}
 
-	if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0)
+	if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
 		seq_printf(m, "  %-36s", fn_name);
 	else
 		seq_printf(m, "  %-36s", "<NA>");
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index ee7f9fbaffbd..29525500df00 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -51,13 +51,13 @@ static void sample_hbp_handler(struct perf_event *temp, void *data)
 static int __init hw_break_module_init(void)
 {
 	int ret;
-	unsigned long addr;
+	DEFINE_BREAKPOINT_ATTR(attr);
 
-	addr = kallsyms_lookup_name(ksym_name);
+	attr.bp_addr = kallsyms_lookup_name(ksym_name);
+	attr.bp_len = HW_BREAKPOINT_LEN_4;
+	attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
 
-	sample_hbp = register_wide_hw_breakpoint(addr, HW_BREAKPOINT_LEN_4,
-						 HW_BREAKPOINT_W | HW_BREAKPOINT_R,
-						 sample_hbp_handler, true);
+	sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler);
 	if (IS_ERR(sample_hbp)) {
 		ret = PTR_ERR(sample_hbp);
 		goto fail;
-- 
cgit v1.2.3


From c8602edf3f9471466755329b78d309f2a01dd449 Mon Sep 17 00:00:00 2001
From: Thomas Kunze <thommycheck@gmx.de>
Date: Tue, 10 Feb 2009 14:54:57 +0100
Subject: move drivers/mfd/*.h to include/linux/mfd

So drivers like collie_battery driver can use
those files easier.
---
 drivers/mfd/mcp-core.c        |   2 +-
 drivers/mfd/mcp-sa11x0.c      |   2 +-
 drivers/mfd/mcp.h             |  66 -----------
 drivers/mfd/ucb1x00-assabet.c |   2 +-
 drivers/mfd/ucb1x00-core.c    |   2 +-
 drivers/mfd/ucb1x00-ts.c      |   2 +-
 drivers/mfd/ucb1x00.h         | 255 ------------------------------------------
 include/linux/mfd/mcp.h       |  68 +++++++++++
 include/linux/mfd/ucb1x00.h   | 255 ++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 328 insertions(+), 326 deletions(-)
 delete mode 100644 drivers/mfd/mcp.h
 delete mode 100644 drivers/mfd/ucb1x00.h
 create mode 100644 include/linux/mfd/mcp.h
 create mode 100644 include/linux/mfd/ucb1x00.h

(limited to 'include/linux')

diff --git a/drivers/mfd/mcp-core.c b/drivers/mfd/mcp-core.c
index 57271cb3b316..84815f9ef636 100644
--- a/drivers/mfd/mcp-core.c
+++ b/drivers/mfd/mcp-core.c
@@ -17,11 +17,11 @@
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/mfd/mcp.h>
 
 #include <mach/dma.h>
 #include <asm/system.h>
 
-#include "mcp.h"
 
 #define to_mcp(d)		container_of(d, struct mcp, attached_device)
 #define to_mcp_driver(d)	container_of(d, struct mcp_driver, drv)
diff --git a/drivers/mfd/mcp-sa11x0.c b/drivers/mfd/mcp-sa11x0.c
index 62b32dabf629..212189815c8e 100644
--- a/drivers/mfd/mcp-sa11x0.c
+++ b/drivers/mfd/mcp-sa11x0.c
@@ -19,6 +19,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/mcp.h>
 
 #include <mach/dma.h>
 #include <mach/hardware.h>
@@ -28,7 +29,6 @@
 
 #include <mach/assabet.h>
 
-#include "mcp.h"
 
 struct mcp_sa11x0 {
 	u32	mccr0;
diff --git a/drivers/mfd/mcp.h b/drivers/mfd/mcp.h
deleted file mode 100644
index c093a93b8808..000000000000
--- a/drivers/mfd/mcp.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- *  linux/drivers/mfd/mcp.h
- *
- *  Copyright (C) 2001 Russell King, All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- */
-#ifndef MCP_H
-#define MCP_H
-
-struct mcp_ops;
-
-struct mcp {
-	struct module	*owner;
-	struct mcp_ops	*ops;
-	spinlock_t	lock;
-	int		use_count;
-	unsigned int	sclk_rate;
-	unsigned int	rw_timeout;
-	dma_device_t	dma_audio_rd;
-	dma_device_t	dma_audio_wr;
-	dma_device_t	dma_telco_rd;
-	dma_device_t	dma_telco_wr;
-	struct device	attached_device;
-};
-
-struct mcp_ops {
-	void		(*set_telecom_divisor)(struct mcp *, unsigned int);
-	void		(*set_audio_divisor)(struct mcp *, unsigned int);
-	void		(*reg_write)(struct mcp *, unsigned int, unsigned int);
-	unsigned int	(*reg_read)(struct mcp *, unsigned int);
-	void		(*enable)(struct mcp *);
-	void		(*disable)(struct mcp *);
-};
-
-void mcp_set_telecom_divisor(struct mcp *, unsigned int);
-void mcp_set_audio_divisor(struct mcp *, unsigned int);
-void mcp_reg_write(struct mcp *, unsigned int, unsigned int);
-unsigned int mcp_reg_read(struct mcp *, unsigned int);
-void mcp_enable(struct mcp *);
-void mcp_disable(struct mcp *);
-#define mcp_get_sclk_rate(mcp)	((mcp)->sclk_rate)
-
-struct mcp *mcp_host_alloc(struct device *, size_t);
-int mcp_host_register(struct mcp *);
-void mcp_host_unregister(struct mcp *);
-
-struct mcp_driver {
-	struct device_driver drv;
-	int (*probe)(struct mcp *);
-	void (*remove)(struct mcp *);
-	int (*suspend)(struct mcp *, pm_message_t);
-	int (*resume)(struct mcp *);
-};
-
-int mcp_driver_register(struct mcp_driver *);
-void mcp_driver_unregister(struct mcp_driver *);
-
-#define mcp_get_drvdata(mcp)	dev_get_drvdata(&(mcp)->attached_device)
-#define mcp_set_drvdata(mcp,d)	dev_set_drvdata(&(mcp)->attached_device, d)
-
-#define mcp_priv(mcp)		((void *)((mcp)+1))
-
-#endif
diff --git a/drivers/mfd/ucb1x00-assabet.c b/drivers/mfd/ucb1x00-assabet.c
index 86fed4870f93..cea9da60850d 100644
--- a/drivers/mfd/ucb1x00-assabet.c
+++ b/drivers/mfd/ucb1x00-assabet.c
@@ -14,10 +14,10 @@
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
 #include <linux/device.h>
+#include <linux/mfd/ucb1x00.h>
 
 #include <mach/dma.h>
 
-#include "ucb1x00.h"
 
 #define UCB1X00_ATTR(name,input)\
 static ssize_t name##_show(struct device *dev, struct device_attribute *attr, \
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
index 60c3988f3cf3..f9de7891e57f 100644
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -25,11 +25,11 @@
 #include <linux/interrupt.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
+#include <linux/mfd/ucb1x00.h>
 
 #include <mach/dma.h>
 #include <mach/hardware.h>
 
-#include "ucb1x00.h"
 
 static DEFINE_MUTEX(ucb1x00_mutex);
 static LIST_HEAD(ucb1x00_drivers);
diff --git a/drivers/mfd/ucb1x00-ts.c b/drivers/mfd/ucb1x00-ts.c
index 61b7d3eb9a2f..000cb414a78a 100644
--- a/drivers/mfd/ucb1x00-ts.c
+++ b/drivers/mfd/ucb1x00-ts.c
@@ -30,12 +30,12 @@
 #include <linux/freezer.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <linux/mfd/ucb1x00.h>
 
 #include <mach/dma.h>
 #include <mach/collie.h>
 #include <asm/mach-types.h>
 
-#include "ucb1x00.h"
 
 
 struct ucb1x00_ts {
diff --git a/drivers/mfd/ucb1x00.h b/drivers/mfd/ucb1x00.h
deleted file mode 100644
index a8ad8a0ed5db..000000000000
--- a/drivers/mfd/ucb1x00.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- *  linux/drivers/mfd/ucb1x00.h
- *
- *  Copyright (C) 2001 Russell King, All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- */
-#ifndef UCB1200_H
-#define UCB1200_H
-
-#define UCB_IO_DATA	0x00
-#define UCB_IO_DIR	0x01
-
-#define UCB_IO_0		(1 << 0)
-#define UCB_IO_1		(1 << 1)
-#define UCB_IO_2		(1 << 2)
-#define UCB_IO_3		(1 << 3)
-#define UCB_IO_4		(1 << 4)
-#define UCB_IO_5		(1 << 5)
-#define UCB_IO_6		(1 << 6)
-#define UCB_IO_7		(1 << 7)
-#define UCB_IO_8		(1 << 8)
-#define UCB_IO_9		(1 << 9)
-
-#define UCB_IE_RIS	0x02
-#define UCB_IE_FAL	0x03
-#define UCB_IE_STATUS	0x04
-#define UCB_IE_CLEAR	0x04
-#define UCB_IE_ADC		(1 << 11)
-#define UCB_IE_TSPX		(1 << 12)
-#define UCB_IE_TSMX		(1 << 13)
-#define UCB_IE_TCLIP		(1 << 14)
-#define UCB_IE_ACLIP		(1 << 15)
-
-#define UCB_IRQ_TSPX		12
-
-#define UCB_TC_A	0x05
-#define UCB_TC_A_LOOP		(1 << 7)	/* UCB1200 */
-#define UCB_TC_A_AMPL		(1 << 7)	/* UCB1300 */
-
-#define UCB_TC_B	0x06
-#define UCB_TC_B_VOICE_ENA	(1 << 3)
-#define UCB_TC_B_CLIP		(1 << 4)
-#define UCB_TC_B_ATT		(1 << 6)
-#define UCB_TC_B_SIDE_ENA	(1 << 11)
-#define UCB_TC_B_MUTE		(1 << 13)
-#define UCB_TC_B_IN_ENA		(1 << 14)
-#define UCB_TC_B_OUT_ENA	(1 << 15)
-
-#define UCB_AC_A	0x07
-#define UCB_AC_B	0x08
-#define UCB_AC_B_LOOP		(1 << 8)
-#define UCB_AC_B_MUTE		(1 << 13)
-#define UCB_AC_B_IN_ENA		(1 << 14)
-#define UCB_AC_B_OUT_ENA	(1 << 15)
-
-#define UCB_TS_CR	0x09
-#define UCB_TS_CR_TSMX_POW	(1 << 0)
-#define UCB_TS_CR_TSPX_POW	(1 << 1)
-#define UCB_TS_CR_TSMY_POW	(1 << 2)
-#define UCB_TS_CR_TSPY_POW	(1 << 3)
-#define UCB_TS_CR_TSMX_GND	(1 << 4)
-#define UCB_TS_CR_TSPX_GND	(1 << 5)
-#define UCB_TS_CR_TSMY_GND	(1 << 6)
-#define UCB_TS_CR_TSPY_GND	(1 << 7)
-#define UCB_TS_CR_MODE_INT	(0 << 8)
-#define UCB_TS_CR_MODE_PRES	(1 << 8)
-#define UCB_TS_CR_MODE_POS	(2 << 8)
-#define UCB_TS_CR_BIAS_ENA	(1 << 11)
-#define UCB_TS_CR_TSPX_LOW	(1 << 12)
-#define UCB_TS_CR_TSMX_LOW	(1 << 13)
-
-#define UCB_ADC_CR	0x0a
-#define UCB_ADC_SYNC_ENA	(1 << 0)
-#define UCB_ADC_VREFBYP_CON	(1 << 1)
-#define UCB_ADC_INP_TSPX	(0 << 2)
-#define UCB_ADC_INP_TSMX	(1 << 2)
-#define UCB_ADC_INP_TSPY	(2 << 2)
-#define UCB_ADC_INP_TSMY	(3 << 2)
-#define UCB_ADC_INP_AD0		(4 << 2)
-#define UCB_ADC_INP_AD1		(5 << 2)
-#define UCB_ADC_INP_AD2		(6 << 2)
-#define UCB_ADC_INP_AD3		(7 << 2)
-#define UCB_ADC_EXT_REF		(1 << 5)
-#define UCB_ADC_START		(1 << 7)
-#define UCB_ADC_ENA		(1 << 15)
-
-#define UCB_ADC_DATA	0x0b
-#define UCB_ADC_DAT_VAL		(1 << 15)
-#define UCB_ADC_DAT(x)		(((x) & 0x7fe0) >> 5)
-
-#define UCB_ID		0x0c
-#define UCB_ID_1200		0x1004
-#define UCB_ID_1300		0x1005
-#define UCB_ID_TC35143          0x9712
-
-#define UCB_MODE	0x0d
-#define UCB_MODE_DYN_VFLAG_ENA	(1 << 12)
-#define UCB_MODE_AUD_OFF_CAN	(1 << 13)
-
-#include "mcp.h"
-
-struct ucb1x00_irq {
-	void *devid;
-	void (*fn)(int, void *);
-};
-
-struct ucb1x00 {
-	spinlock_t		lock;
-	struct mcp		*mcp;
-	unsigned int		irq;
-	struct semaphore	adc_sem;
-	spinlock_t		io_lock;
-	u16			id;
-	u16			io_dir;
-	u16			io_out;
-	u16			adc_cr;
-	u16			irq_fal_enbl;
-	u16			irq_ris_enbl;
-	struct ucb1x00_irq	irq_handler[16];
-	struct device		dev;
-	struct list_head	node;
-	struct list_head	devs;
-};
-
-struct ucb1x00_driver;
-
-struct ucb1x00_dev {
-	struct list_head	dev_node;
-	struct list_head	drv_node;
-	struct ucb1x00		*ucb;
-	struct ucb1x00_driver	*drv;
-	void			*priv;
-};
-
-struct ucb1x00_driver {
-	struct list_head	node;
-	struct list_head	devs;
-	int	(*add)(struct ucb1x00_dev *dev);
-	void	(*remove)(struct ucb1x00_dev *dev);
-	int	(*suspend)(struct ucb1x00_dev *dev, pm_message_t state);
-	int	(*resume)(struct ucb1x00_dev *dev);
-};
-
-#define classdev_to_ucb1x00(cd)	container_of(cd, struct ucb1x00, dev)
-
-int ucb1x00_register_driver(struct ucb1x00_driver *);
-void ucb1x00_unregister_driver(struct ucb1x00_driver *);
-
-/**
- *	ucb1x00_clkrate - return the UCB1x00 SIB clock rate
- *	@ucb: UCB1x00 structure describing chip
- *
- *	Return the SIB clock rate in Hz.
- */
-static inline unsigned int ucb1x00_clkrate(struct ucb1x00 *ucb)
-{
-	return mcp_get_sclk_rate(ucb->mcp);
-}
-
-/**
- *	ucb1x00_enable - enable the UCB1x00 SIB clock
- *	@ucb: UCB1x00 structure describing chip
- *
- *	Enable the SIB clock.  This can be called multiple times.
- */
-static inline void ucb1x00_enable(struct ucb1x00 *ucb)
-{
-	mcp_enable(ucb->mcp);
-}
-
-/**
- *	ucb1x00_disable - disable the UCB1x00 SIB clock
- *	@ucb: UCB1x00 structure describing chip
- *
- *	Disable the SIB clock.  The SIB clock will only be disabled
- *	when the number of ucb1x00_enable calls match the number of
- *	ucb1x00_disable calls.
- */
-static inline void ucb1x00_disable(struct ucb1x00 *ucb)
-{
-	mcp_disable(ucb->mcp);
-}
-
-/**
- *	ucb1x00_reg_write - write a UCB1x00 register
- *	@ucb: UCB1x00 structure describing chip
- *	@reg: UCB1x00 4-bit register index to write
- *	@val: UCB1x00 16-bit value to write
- *
- *	Write the UCB1x00 register @reg with value @val.  The SIB
- *	clock must be running for this function to return.
- */
-static inline void ucb1x00_reg_write(struct ucb1x00 *ucb, unsigned int reg, unsigned int val)
-{
-	mcp_reg_write(ucb->mcp, reg, val);
-}
-
-/**
- *	ucb1x00_reg_read - read a UCB1x00 register
- *	@ucb: UCB1x00 structure describing chip
- *	@reg: UCB1x00 4-bit register index to write
- *
- *	Read the UCB1x00 register @reg and return its value.  The SIB
- *	clock must be running for this function to return.
- */
-static inline unsigned int ucb1x00_reg_read(struct ucb1x00 *ucb, unsigned int reg)
-{
-	return mcp_reg_read(ucb->mcp, reg);
-}
-/**
- *	ucb1x00_set_audio_divisor - 
- *	@ucb: UCB1x00 structure describing chip
- *	@div: SIB clock divisor
- */
-static inline void ucb1x00_set_audio_divisor(struct ucb1x00 *ucb, unsigned int div)
-{
-	mcp_set_audio_divisor(ucb->mcp, div);
-}
-
-/**
- *	ucb1x00_set_telecom_divisor -
- *	@ucb: UCB1x00 structure describing chip
- *	@div: SIB clock divisor
- */
-static inline void ucb1x00_set_telecom_divisor(struct ucb1x00 *ucb, unsigned int div)
-{
-	mcp_set_telecom_divisor(ucb->mcp, div);
-}
-
-void ucb1x00_io_set_dir(struct ucb1x00 *ucb, unsigned int, unsigned int);
-void ucb1x00_io_write(struct ucb1x00 *ucb, unsigned int, unsigned int);
-unsigned int ucb1x00_io_read(struct ucb1x00 *ucb);
-
-#define UCB_NOSYNC	(0)
-#define UCB_SYNC	(1)
-
-unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync);
-void ucb1x00_adc_enable(struct ucb1x00 *ucb);
-void ucb1x00_adc_disable(struct ucb1x00 *ucb);
-
-/*
- * Which edges of the IRQ do you want to control today?
- */
-#define UCB_RISING	(1 << 0)
-#define UCB_FALLING	(1 << 1)
-
-int ucb1x00_hook_irq(struct ucb1x00 *ucb, unsigned int idx, void (*fn)(int, void *), void *devid);
-void ucb1x00_enable_irq(struct ucb1x00 *ucb, unsigned int idx, int edges);
-void ucb1x00_disable_irq(struct ucb1x00 *ucb, unsigned int idx, int edges);
-int ucb1x00_free_irq(struct ucb1x00 *ucb, unsigned int idx, void *devid);
-
-#endif
diff --git a/include/linux/mfd/mcp.h b/include/linux/mfd/mcp.h
new file mode 100644
index 000000000000..be95e09fd746
--- /dev/null
+++ b/include/linux/mfd/mcp.h
@@ -0,0 +1,68 @@
+/*
+ *  linux/drivers/mfd/mcp.h
+ *
+ *  Copyright (C) 2001 Russell King, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ */
+#ifndef MCP_H
+#define MCP_H
+
+#include <mach/dma.h>
+
+struct mcp_ops;
+
+struct mcp {
+	struct module	*owner;
+	struct mcp_ops	*ops;
+	spinlock_t	lock;
+	int		use_count;
+	unsigned int	sclk_rate;
+	unsigned int	rw_timeout;
+	dma_device_t	dma_audio_rd;
+	dma_device_t	dma_audio_wr;
+	dma_device_t	dma_telco_rd;
+	dma_device_t	dma_telco_wr;
+	struct device	attached_device;
+};
+
+struct mcp_ops {
+	void		(*set_telecom_divisor)(struct mcp *, unsigned int);
+	void		(*set_audio_divisor)(struct mcp *, unsigned int);
+	void		(*reg_write)(struct mcp *, unsigned int, unsigned int);
+	unsigned int	(*reg_read)(struct mcp *, unsigned int);
+	void		(*enable)(struct mcp *);
+	void		(*disable)(struct mcp *);
+};
+
+void mcp_set_telecom_divisor(struct mcp *, unsigned int);
+void mcp_set_audio_divisor(struct mcp *, unsigned int);
+void mcp_reg_write(struct mcp *, unsigned int, unsigned int);
+unsigned int mcp_reg_read(struct mcp *, unsigned int);
+void mcp_enable(struct mcp *);
+void mcp_disable(struct mcp *);
+#define mcp_get_sclk_rate(mcp)	((mcp)->sclk_rate)
+
+struct mcp *mcp_host_alloc(struct device *, size_t);
+int mcp_host_register(struct mcp *);
+void mcp_host_unregister(struct mcp *);
+
+struct mcp_driver {
+	struct device_driver drv;
+	int (*probe)(struct mcp *);
+	void (*remove)(struct mcp *);
+	int (*suspend)(struct mcp *, pm_message_t);
+	int (*resume)(struct mcp *);
+};
+
+int mcp_driver_register(struct mcp_driver *);
+void mcp_driver_unregister(struct mcp_driver *);
+
+#define mcp_get_drvdata(mcp)	dev_get_drvdata(&(mcp)->attached_device)
+#define mcp_set_drvdata(mcp,d)	dev_set_drvdata(&(mcp)->attached_device, d)
+
+#define mcp_priv(mcp)		((void *)((mcp)+1))
+
+#endif
diff --git a/include/linux/mfd/ucb1x00.h b/include/linux/mfd/ucb1x00.h
new file mode 100644
index 000000000000..eac346336382
--- /dev/null
+++ b/include/linux/mfd/ucb1x00.h
@@ -0,0 +1,255 @@
+/*
+ *  linux/include/mfd/ucb1x00.h
+ *
+ *  Copyright (C) 2001 Russell King, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ */
+#ifndef UCB1200_H
+#define UCB1200_H
+
+#include <linux/mfd/mcp.h>
+#define UCB_IO_DATA	0x00
+#define UCB_IO_DIR	0x01
+
+#define UCB_IO_0		(1 << 0)
+#define UCB_IO_1		(1 << 1)
+#define UCB_IO_2		(1 << 2)
+#define UCB_IO_3		(1 << 3)
+#define UCB_IO_4		(1 << 4)
+#define UCB_IO_5		(1 << 5)
+#define UCB_IO_6		(1 << 6)
+#define UCB_IO_7		(1 << 7)
+#define UCB_IO_8		(1 << 8)
+#define UCB_IO_9		(1 << 9)
+
+#define UCB_IE_RIS	0x02
+#define UCB_IE_FAL	0x03
+#define UCB_IE_STATUS	0x04
+#define UCB_IE_CLEAR	0x04
+#define UCB_IE_ADC		(1 << 11)
+#define UCB_IE_TSPX		(1 << 12)
+#define UCB_IE_TSMX		(1 << 13)
+#define UCB_IE_TCLIP		(1 << 14)
+#define UCB_IE_ACLIP		(1 << 15)
+
+#define UCB_IRQ_TSPX		12
+
+#define UCB_TC_A	0x05
+#define UCB_TC_A_LOOP		(1 << 7)	/* UCB1200 */
+#define UCB_TC_A_AMPL		(1 << 7)	/* UCB1300 */
+
+#define UCB_TC_B	0x06
+#define UCB_TC_B_VOICE_ENA	(1 << 3)
+#define UCB_TC_B_CLIP		(1 << 4)
+#define UCB_TC_B_ATT		(1 << 6)
+#define UCB_TC_B_SIDE_ENA	(1 << 11)
+#define UCB_TC_B_MUTE		(1 << 13)
+#define UCB_TC_B_IN_ENA		(1 << 14)
+#define UCB_TC_B_OUT_ENA	(1 << 15)
+
+#define UCB_AC_A	0x07
+#define UCB_AC_B	0x08
+#define UCB_AC_B_LOOP		(1 << 8)
+#define UCB_AC_B_MUTE		(1 << 13)
+#define UCB_AC_B_IN_ENA		(1 << 14)
+#define UCB_AC_B_OUT_ENA	(1 << 15)
+
+#define UCB_TS_CR	0x09
+#define UCB_TS_CR_TSMX_POW	(1 << 0)
+#define UCB_TS_CR_TSPX_POW	(1 << 1)
+#define UCB_TS_CR_TSMY_POW	(1 << 2)
+#define UCB_TS_CR_TSPY_POW	(1 << 3)
+#define UCB_TS_CR_TSMX_GND	(1 << 4)
+#define UCB_TS_CR_TSPX_GND	(1 << 5)
+#define UCB_TS_CR_TSMY_GND	(1 << 6)
+#define UCB_TS_CR_TSPY_GND	(1 << 7)
+#define UCB_TS_CR_MODE_INT	(0 << 8)
+#define UCB_TS_CR_MODE_PRES	(1 << 8)
+#define UCB_TS_CR_MODE_POS	(2 << 8)
+#define UCB_TS_CR_BIAS_ENA	(1 << 11)
+#define UCB_TS_CR_TSPX_LOW	(1 << 12)
+#define UCB_TS_CR_TSMX_LOW	(1 << 13)
+
+#define UCB_ADC_CR	0x0a
+#define UCB_ADC_SYNC_ENA	(1 << 0)
+#define UCB_ADC_VREFBYP_CON	(1 << 1)
+#define UCB_ADC_INP_TSPX	(0 << 2)
+#define UCB_ADC_INP_TSMX	(1 << 2)
+#define UCB_ADC_INP_TSPY	(2 << 2)
+#define UCB_ADC_INP_TSMY	(3 << 2)
+#define UCB_ADC_INP_AD0		(4 << 2)
+#define UCB_ADC_INP_AD1		(5 << 2)
+#define UCB_ADC_INP_AD2		(6 << 2)
+#define UCB_ADC_INP_AD3		(7 << 2)
+#define UCB_ADC_EXT_REF		(1 << 5)
+#define UCB_ADC_START		(1 << 7)
+#define UCB_ADC_ENA		(1 << 15)
+
+#define UCB_ADC_DATA	0x0b
+#define UCB_ADC_DAT_VAL		(1 << 15)
+#define UCB_ADC_DAT(x)		(((x) & 0x7fe0) >> 5)
+
+#define UCB_ID		0x0c
+#define UCB_ID_1200		0x1004
+#define UCB_ID_1300		0x1005
+#define UCB_ID_TC35143          0x9712
+
+#define UCB_MODE	0x0d
+#define UCB_MODE_DYN_VFLAG_ENA	(1 << 12)
+#define UCB_MODE_AUD_OFF_CAN	(1 << 13)
+
+
+struct ucb1x00_irq {
+	void *devid;
+	void (*fn)(int, void *);
+};
+
+struct ucb1x00 {
+	spinlock_t		lock;
+	struct mcp		*mcp;
+	unsigned int		irq;
+	struct semaphore	adc_sem;
+	spinlock_t		io_lock;
+	u16			id;
+	u16			io_dir;
+	u16			io_out;
+	u16			adc_cr;
+	u16			irq_fal_enbl;
+	u16			irq_ris_enbl;
+	struct ucb1x00_irq	irq_handler[16];
+	struct device		dev;
+	struct list_head	node;
+	struct list_head	devs;
+};
+
+struct ucb1x00_driver;
+
+struct ucb1x00_dev {
+	struct list_head	dev_node;
+	struct list_head	drv_node;
+	struct ucb1x00		*ucb;
+	struct ucb1x00_driver	*drv;
+	void			*priv;
+};
+
+struct ucb1x00_driver {
+	struct list_head	node;
+	struct list_head	devs;
+	int	(*add)(struct ucb1x00_dev *dev);
+	void	(*remove)(struct ucb1x00_dev *dev);
+	int	(*suspend)(struct ucb1x00_dev *dev, pm_message_t state);
+	int	(*resume)(struct ucb1x00_dev *dev);
+};
+
+#define classdev_to_ucb1x00(cd)	container_of(cd, struct ucb1x00, dev)
+
+int ucb1x00_register_driver(struct ucb1x00_driver *);
+void ucb1x00_unregister_driver(struct ucb1x00_driver *);
+
+/**
+ *	ucb1x00_clkrate - return the UCB1x00 SIB clock rate
+ *	@ucb: UCB1x00 structure describing chip
+ *
+ *	Return the SIB clock rate in Hz.
+ */
+static inline unsigned int ucb1x00_clkrate(struct ucb1x00 *ucb)
+{
+	return mcp_get_sclk_rate(ucb->mcp);
+}
+
+/**
+ *	ucb1x00_enable - enable the UCB1x00 SIB clock
+ *	@ucb: UCB1x00 structure describing chip
+ *
+ *	Enable the SIB clock.  This can be called multiple times.
+ */
+static inline void ucb1x00_enable(struct ucb1x00 *ucb)
+{
+	mcp_enable(ucb->mcp);
+}
+
+/**
+ *	ucb1x00_disable - disable the UCB1x00 SIB clock
+ *	@ucb: UCB1x00 structure describing chip
+ *
+ *	Disable the SIB clock.  The SIB clock will only be disabled
+ *	when the number of ucb1x00_enable calls match the number of
+ *	ucb1x00_disable calls.
+ */
+static inline void ucb1x00_disable(struct ucb1x00 *ucb)
+{
+	mcp_disable(ucb->mcp);
+}
+
+/**
+ *	ucb1x00_reg_write - write a UCB1x00 register
+ *	@ucb: UCB1x00 structure describing chip
+ *	@reg: UCB1x00 4-bit register index to write
+ *	@val: UCB1x00 16-bit value to write
+ *
+ *	Write the UCB1x00 register @reg with value @val.  The SIB
+ *	clock must be running for this function to return.
+ */
+static inline void ucb1x00_reg_write(struct ucb1x00 *ucb, unsigned int reg, unsigned int val)
+{
+	mcp_reg_write(ucb->mcp, reg, val);
+}
+
+/**
+ *	ucb1x00_reg_read - read a UCB1x00 register
+ *	@ucb: UCB1x00 structure describing chip
+ *	@reg: UCB1x00 4-bit register index to write
+ *
+ *	Read the UCB1x00 register @reg and return its value.  The SIB
+ *	clock must be running for this function to return.
+ */
+static inline unsigned int ucb1x00_reg_read(struct ucb1x00 *ucb, unsigned int reg)
+{
+	return mcp_reg_read(ucb->mcp, reg);
+}
+/**
+ *	ucb1x00_set_audio_divisor - 
+ *	@ucb: UCB1x00 structure describing chip
+ *	@div: SIB clock divisor
+ */
+static inline void ucb1x00_set_audio_divisor(struct ucb1x00 *ucb, unsigned int div)
+{
+	mcp_set_audio_divisor(ucb->mcp, div);
+}
+
+/**
+ *	ucb1x00_set_telecom_divisor -
+ *	@ucb: UCB1x00 structure describing chip
+ *	@div: SIB clock divisor
+ */
+static inline void ucb1x00_set_telecom_divisor(struct ucb1x00 *ucb, unsigned int div)
+{
+	mcp_set_telecom_divisor(ucb->mcp, div);
+}
+
+void ucb1x00_io_set_dir(struct ucb1x00 *ucb, unsigned int, unsigned int);
+void ucb1x00_io_write(struct ucb1x00 *ucb, unsigned int, unsigned int);
+unsigned int ucb1x00_io_read(struct ucb1x00 *ucb);
+
+#define UCB_NOSYNC	(0)
+#define UCB_SYNC	(1)
+
+unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync);
+void ucb1x00_adc_enable(struct ucb1x00 *ucb);
+void ucb1x00_adc_disable(struct ucb1x00 *ucb);
+
+/*
+ * Which edges of the IRQ do you want to control today?
+ */
+#define UCB_RISING	(1 << 0)
+#define UCB_FALLING	(1 << 1)
+
+int ucb1x00_hook_irq(struct ucb1x00 *ucb, unsigned int idx, void (*fn)(int, void *), void *devid);
+void ucb1x00_enable_irq(struct ucb1x00 *ucb, unsigned int idx, int edges);
+void ucb1x00_disable_irq(struct ucb1x00 *ucb, unsigned int idx, int edges);
+int ucb1x00_free_irq(struct ucb1x00 *ucb, unsigned int idx, void *devid);
+
+#endif
-- 
cgit v1.2.3


From 9ca3dc805cd0d89c44f88b9a399061946781323a Mon Sep 17 00:00:00 2001
From: Thomas Kunze <thommycheck@gmx.de>
Date: Tue, 10 Feb 2009 14:50:56 +0100
Subject: add gpiolib support to ucb1x00

The old access methods to the gpios will be removed when
all users has been converted. (mainly ucb1x00-ts)
---
 arch/arm/mach-sa1100/include/mach/mcp.h |  1 +
 drivers/mfd/mcp-sa11x0.c                |  1 +
 drivers/mfd/ucb1x00-core.c              | 87 ++++++++++++++++++++++++++++++++-
 include/linux/mfd/mcp.h                 |  1 +
 include/linux/mfd/ucb1x00.h             |  3 ++
 5 files changed, 91 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-sa1100/include/mach/mcp.h b/arch/arm/mach-sa1100/include/mach/mcp.h
index fb8b09a57ad7..ed1a331508a7 100644
--- a/arch/arm/mach-sa1100/include/mach/mcp.h
+++ b/arch/arm/mach-sa1100/include/mach/mcp.h
@@ -16,6 +16,7 @@ struct mcp_plat_data {
 	u32 mccr0;
 	u32 mccr1;
 	unsigned int sclk_rate;
+	int gpio_base;
 };
 
 #endif
diff --git a/drivers/mfd/mcp-sa11x0.c b/drivers/mfd/mcp-sa11x0.c
index 212189815c8e..258427232728 100644
--- a/drivers/mfd/mcp-sa11x0.c
+++ b/drivers/mfd/mcp-sa11x0.c
@@ -163,6 +163,7 @@ static int mcp_sa11x0_probe(struct platform_device *pdev)
 	mcp->dma_audio_wr	= DMA_Ser4MCP0Wr;
 	mcp->dma_telco_rd	= DMA_Ser4MCP1Rd;
 	mcp->dma_telco_wr	= DMA_Ser4MCP1Wr;
+	mcp->gpio_base		= data->gpio_base;
 
 	platform_set_drvdata(pdev, mcp);
 
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
index f9de7891e57f..252b74188ec2 100644
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -26,11 +26,11 @@
 #include <linux/device.h>
 #include <linux/mutex.h>
 #include <linux/mfd/ucb1x00.h>
+#include <linux/gpio.h>
 
 #include <mach/dma.h>
 #include <mach/hardware.h>
 
-
 static DEFINE_MUTEX(ucb1x00_mutex);
 static LIST_HEAD(ucb1x00_drivers);
 static LIST_HEAD(ucb1x00_devices);
@@ -108,6 +108,60 @@ unsigned int ucb1x00_io_read(struct ucb1x00 *ucb)
 	return ucb1x00_reg_read(ucb, UCB_IO_DATA);
 }
 
+static void ucb1x00_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
+{
+	struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ucb->io_lock, flags);
+	if (value)
+		ucb->io_out |= 1 << offset;
+	else
+		ucb->io_out &= ~(1 << offset);
+
+	ucb1x00_reg_write(ucb, UCB_IO_DATA, ucb->io_out);
+	spin_unlock_irqrestore(&ucb->io_lock, flags);
+}
+
+static int ucb1x00_gpio_get(struct gpio_chip *chip, unsigned offset)
+{
+	struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+	return ucb1x00_reg_read(ucb, UCB_IO_DATA) & (1 << offset);
+}
+
+static int ucb1x00_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
+{
+	struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ucb->io_lock, flags);
+	ucb->io_dir &= ~(1 << offset);
+	ucb1x00_reg_write(ucb, UCB_IO_DIR, ucb->io_dir);
+	spin_unlock_irqrestore(&ucb->io_lock, flags);
+
+	return 0;
+}
+
+static int ucb1x00_gpio_direction_output(struct gpio_chip *chip, unsigned offset
+		, int value)
+{
+	struct ucb1x00 *ucb = container_of(chip, struct ucb1x00, gpio);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ucb->io_lock, flags);
+	ucb->io_dir |= (1 << offset);
+	ucb1x00_reg_write(ucb, UCB_IO_DIR, ucb->io_dir);
+
+	if (value)
+		ucb->io_out |= 1 << offset;
+	else
+		ucb->io_out &= ~(1 << offset);
+	ucb1x00_reg_write(ucb, UCB_IO_DATA, ucb->io_out);
+	spin_unlock_irqrestore(&ucb->io_lock, flags);
+
+	return 0;
+}
+
 /*
  * UCB1300 data sheet says we must:
  *  1. enable ADC	=> 5us (including reference startup time)
@@ -476,6 +530,7 @@ static int ucb1x00_probe(struct mcp *mcp)
 	struct ucb1x00_driver *drv;
 	unsigned int id;
 	int ret = -ENODEV;
+	int temp;
 
 	mcp_enable(mcp);
 	id = mcp_reg_read(mcp, UCB_ID);
@@ -508,12 +563,27 @@ static int ucb1x00_probe(struct mcp *mcp)
 		goto err_free;
 	}
 
+	ucb->gpio.base = -1;
+	if (mcp->gpio_base != 0) {
+		ucb->gpio.label = dev_name(&ucb->dev);
+		ucb->gpio.base = mcp->gpio_base;
+		ucb->gpio.ngpio = 10;
+		ucb->gpio.set = ucb1x00_gpio_set;
+		ucb->gpio.get = ucb1x00_gpio_get;
+		ucb->gpio.direction_input = ucb1x00_gpio_direction_input;
+		ucb->gpio.direction_output = ucb1x00_gpio_direction_output;
+		ret = gpiochip_add(&ucb->gpio);
+		if (ret)
+			goto err_free;
+	} else
+		dev_info(&ucb->dev, "gpio_base not set so no gpiolib support");
+
 	ret = request_irq(ucb->irq, ucb1x00_irq, IRQF_TRIGGER_RISING,
 			  "UCB1x00", ucb);
 	if (ret) {
 		printk(KERN_ERR "ucb1x00: unable to grab irq%d: %d\n",
 			ucb->irq, ret);
-		goto err_free;
+		goto err_gpio;
 	}
 
 	mcp_set_drvdata(mcp, ucb);
@@ -522,6 +592,7 @@ static int ucb1x00_probe(struct mcp *mcp)
 	if (ret)
 		goto err_irq;
 
+
 	INIT_LIST_HEAD(&ucb->devs);
 	mutex_lock(&ucb1x00_mutex);
 	list_add(&ucb->node, &ucb1x00_devices);
@@ -529,10 +600,14 @@ static int ucb1x00_probe(struct mcp *mcp)
 		ucb1x00_add_dev(ucb, drv);
 	}
 	mutex_unlock(&ucb1x00_mutex);
+
 	goto out;
 
  err_irq:
 	free_irq(ucb->irq, ucb);
+ err_gpio:
+	if (ucb->gpio.base != -1)
+		temp = gpiochip_remove(&ucb->gpio);
  err_free:
 	kfree(ucb);
  err_disable:
@@ -545,6 +620,7 @@ static void ucb1x00_remove(struct mcp *mcp)
 {
 	struct ucb1x00 *ucb = mcp_get_drvdata(mcp);
 	struct list_head *l, *n;
+	int ret;
 
 	mutex_lock(&ucb1x00_mutex);
 	list_del(&ucb->node);
@@ -554,6 +630,12 @@ static void ucb1x00_remove(struct mcp *mcp)
 	}
 	mutex_unlock(&ucb1x00_mutex);
 
+	if (ucb->gpio.base != -1) {
+		ret = gpiochip_remove(&ucb->gpio);
+		if (ret)
+			dev_err(&ucb->dev, "Can't remove gpio chip: %d\n", ret);
+	}
+
 	free_irq(ucb->irq, ucb);
 	device_unregister(&ucb->dev);
 }
@@ -604,6 +686,7 @@ static int ucb1x00_resume(struct mcp *mcp)
 	struct ucb1x00 *ucb = mcp_get_drvdata(mcp);
 	struct ucb1x00_dev *dev;
 
+	ucb1x00_reg_write(ucb, UCB_IO_DIR, ucb->io_dir);
 	mutex_lock(&ucb1x00_mutex);
 	list_for_each_entry(dev, &ucb->devs, dev_node) {
 		if (dev->drv->resume)
diff --git a/include/linux/mfd/mcp.h b/include/linux/mfd/mcp.h
index be95e09fd746..ee496708e38b 100644
--- a/include/linux/mfd/mcp.h
+++ b/include/linux/mfd/mcp.h
@@ -26,6 +26,7 @@ struct mcp {
 	dma_device_t	dma_telco_rd;
 	dma_device_t	dma_telco_wr;
 	struct device	attached_device;
+	int		gpio_base;
 };
 
 struct mcp_ops {
diff --git a/include/linux/mfd/ucb1x00.h b/include/linux/mfd/ucb1x00.h
index eac346336382..aa9c3789bed4 100644
--- a/include/linux/mfd/ucb1x00.h
+++ b/include/linux/mfd/ucb1x00.h
@@ -11,6 +11,8 @@
 #define UCB1200_H
 
 #include <linux/mfd/mcp.h>
+#include <linux/gpio.h>
+
 #define UCB_IO_DATA	0x00
 #define UCB_IO_DIR	0x01
 
@@ -123,6 +125,7 @@ struct ucb1x00 {
 	struct device		dev;
 	struct list_head	node;
 	struct list_head	devs;
+	struct gpio_chip 	gpio;
 };
 
 struct ucb1x00_driver;
-- 
cgit v1.2.3


From 67fbb16be69d138a3b6645ec5395b487cb915c58 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Tue, 24 Nov 2009 23:59:15 +0100
Subject: nl80211: PMKSA caching support

This is an interface to set, delete and flush PMKIDs through nl80211.
Main users would be fullmac devices which firmwares are capable of
generating the RSN IEs for the re-association requests, e.g. iwmc3200wifi.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h |   2 +
 include/linux/nl80211.h   |  11 +++++
 include/net/cfg80211.h    |  28 +++++++++++
 net/wireless/nl80211.c    | 120 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 161 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index afa8e0ac27a7..d9724a28c0c2 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1266,6 +1266,8 @@ enum ieee80211_sa_query_action {
 
 #define WLAN_MAX_KEY_LEN		32
 
+#define WLAN_PMKID_LEN			16
+
 /**
  * ieee80211_get_qos_ctl - get pointer to qos control bytes
  * @hdr: the frame
diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 45db17f81aa3..da8ea2e19273 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -349,6 +349,10 @@ enum nl80211_commands {
 	NL80211_CMD_GET_SURVEY,
 	NL80211_CMD_NEW_SURVEY_RESULTS,
 
+	NL80211_CMD_SET_PMKSA,
+	NL80211_CMD_DEL_PMKSA,
+	NL80211_CMD_FLUSH_PMKSA,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -598,6 +602,10 @@ enum nl80211_commands {
  *      the survey response for %NL80211_CMD_GET_SURVEY, nested attribute
  *      containing info as possible, see &enum survey_info.
  *
+ * @NL80211_ATTR_PMKID: PMK material for PMKSA caching.
+ * @NL80211_ATTR_MAX_NUM_PMKIDS: maximum number of PMKIDs a firmware can
+ *	cache, a wiphy attribute.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -732,6 +740,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_SURVEY_INFO,
 
+	NL80211_ATTR_PMKID,
+	NL80211_ATTR_MAX_NUM_PMKIDS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a6492e9bca97..0884b9a0f778 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -871,6 +871,19 @@ struct cfg80211_bitrate_mask {
 	u32 fixed;   /* fixed bitrate, 0 == not fixed */
 	u32 maxrate; /* in kbps, 0 == no limit */
 };
+/**
+ * struct cfg80211_pmksa - PMK Security Association
+ *
+ * This structure is passed to the set/del_pmksa() method for PMKSA
+ * caching.
+ *
+ * @bssid: The AP's BSSID.
+ * @pmkid: The PMK material itself.
+ */
+struct cfg80211_pmksa {
+	u8 *bssid;
+	u8 *pmkid;
+};
 
 /**
  * struct cfg80211_ops - backend description for wireless configuration
@@ -976,6 +989,13 @@ struct cfg80211_bitrate_mask {
  * @dump_survey: get site survey information.
  *
  * @testmode_cmd: run a test mode command
+ *
+ * @set_pmksa: Cache a PMKID for a BSSID. This is mostly useful for fullmac
+ *	devices running firmwares capable of generating the (re) association
+ *	RSN IE. It allows for faster roaming between WPA2 BSSIDs.
+ * @del_pmksa: Delete a cached PMKID.
+ * @flush_pmksa: Flush all cached PMKIDs.
+ *
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy);
@@ -1097,6 +1117,12 @@ struct cfg80211_ops {
 	int	(*dump_survey)(struct wiphy *wiphy, struct net_device *netdev,
 			int idx, struct survey_info *info);
 
+	int	(*set_pmksa)(struct wiphy *wiphy, struct net_device *netdev,
+			     struct cfg80211_pmksa *pmksa);
+	int	(*del_pmksa)(struct wiphy *wiphy, struct net_device *netdev,
+			     struct cfg80211_pmksa *pmksa);
+	int	(*flush_pmksa)(struct wiphy *wiphy, struct net_device *netdev);
+
 	/* some temporary stuff to finish wext */
 	int	(*set_power_mgmt)(struct wiphy *wiphy, struct net_device *dev,
 				  bool enabled, int timeout);
@@ -1195,6 +1221,8 @@ struct wiphy {
 	char fw_version[ETHTOOL_BUSINFO_LEN];
 	u32 hw_version;
 
+	u8 max_num_pmkids;
+
 	/* If multiple wiphys are registered and you're handed e.g.
 	 * a regular netdev with assigned ieee80211_ptr, you won't
 	 * know whether it points to a wiphy your driver has registered
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 149539ade15e..a6028433e3a0 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -139,6 +139,8 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 	[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
 	[NL80211_ATTR_PID] = { .type = NLA_U32 },
 	[NL80211_ATTR_4ADDR] = { .type = NLA_U8 },
+	[NL80211_ATTR_PMKID] = { .type = NLA_BINARY,
+				 .len = WLAN_PMKID_LEN },
 };
 
 /* policy for the attributes */
@@ -450,6 +452,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 		sizeof(u32) * dev->wiphy.n_cipher_suites,
 		dev->wiphy.cipher_suites);
 
+	NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_PMKIDS,
+		   dev->wiphy.max_num_pmkids);
+
 	nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES);
 	if (!nl_modes)
 		goto nla_put_failure;
@@ -561,6 +566,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	CMD(deauth, DEAUTHENTICATE);
 	CMD(disassoc, DISASSOCIATE);
 	CMD(join_ibss, JOIN_IBSS);
+	CMD(set_pmksa, SET_PMKSA);
+	CMD(del_pmksa, DEL_PMKSA);
+	CMD(flush_pmksa, FLUSH_PMKSA);
 	if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
 		i++;
 		NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS);
@@ -4221,6 +4229,99 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
 	return err;
 }
 
+static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev;
+	int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev,
+			struct cfg80211_pmksa *pmksa) = NULL;
+	int err;
+	struct net_device *dev;
+	struct cfg80211_pmksa pmksa;
+
+	memset(&pmksa, 0, sizeof(struct cfg80211_pmksa));
+
+	if (!info->attrs[NL80211_ATTR_MAC])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_PMKID])
+		return -EINVAL;
+
+	rtnl_lock();
+
+	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
+	if (err)
+		goto out_rtnl;
+
+	pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);
+	pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	switch (info->genlhdr->cmd) {
+	case NL80211_CMD_SET_PMKSA:
+		rdev_ops = rdev->ops->set_pmksa;
+		break;
+	case NL80211_CMD_DEL_PMKSA:
+		rdev_ops = rdev->ops->del_pmksa;
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+
+	if (!rdev_ops) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	err = rdev_ops(&rdev->wiphy, dev, &pmksa);
+
+ out:
+	cfg80211_unlock_rdev(rdev);
+	dev_put(dev);
+ out_rtnl:
+	rtnl_unlock();
+
+	return err;
+}
+
+static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev;
+	int err;
+	struct net_device *dev;
+
+	rtnl_lock();
+
+	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
+	if (err)
+		goto out_rtnl;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (!rdev->ops->flush_pmksa) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	err = rdev->ops->flush_pmksa(&rdev->wiphy, dev);
+
+ out:
+	cfg80211_unlock_rdev(rdev);
+	dev_put(dev);
+ out_rtnl:
+	rtnl_unlock();
+
+	return err;
+
+}
+
 static struct genl_ops nl80211_ops[] = {
 	{
 		.cmd = NL80211_CMD_GET_WIPHY,
@@ -4465,6 +4566,25 @@ static struct genl_ops nl80211_ops[] = {
 		.policy = nl80211_policy,
 		.dumpit = nl80211_dump_survey,
 	},
+	{
+		.cmd = NL80211_CMD_SET_PMKSA,
+		.doit = nl80211_setdel_pmksa,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NL80211_CMD_DEL_PMKSA,
+		.doit = nl80211_setdel_pmksa,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NL80211_CMD_FLUSH_PMKSA,
+		.doit = nl80211_flush_pmksa,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+
 };
 static struct genl_multicast_group nl80211_mlme_mcgrp = {
 	.name = "mlme",
-- 
cgit v1.2.3


From 5789d290cda6854b03986031df02b965572279df Mon Sep 17 00:00:00 2001
From: PJ Waskiewicz <peter.p.waskiewicz.jr@intel.com>
Date: Wed, 25 Nov 2009 00:11:30 +0000
Subject: ethtool: Add Direct Attach support to connector port reporting

This patch allows a base driver to specify Direct Attach as the
type of port through the ethtool interface.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index edd03b79e8f8..bcaa0e0a54d3 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -631,6 +631,8 @@ struct ethtool_ops {
 #define PORT_MII		0x02
 #define PORT_FIBRE		0x03
 #define PORT_BNC		0x04
+#define PORT_DA			0x05
+#define PORT_NONE		0xef
 #define PORT_OTHER		0xff
 
 /* Which transceiver to use. */
-- 
cgit v1.2.3


From 8e7cac79808b62f242069a6ac88d364d35621371 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 29 Nov 2009 16:34:48 +0200
Subject: core: Fix user return notifier on fork()

fork() clones all thread_info flags, including
TIF_USER_RETURN_NOTIFY; if the new task is first scheduled on a cpu
which doesn't have user return notifiers set, this causes user
return notifiers to trigger without any way of clearing itself.

This is easy to trigger with a forky workload on the host in
parallel with kvm, resulting in a cpu in an endless loop on the
verge of returning to userspace.

Fix by dropping the TIF_USER_RETURN_NOTIFY immediately after fork.

Signed-off-by: Avi Kivity <avi@redhat.com>
LKML-Reference: <1259505288-16559-1-git-send-email-avi@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/user-return-notifier.h | 7 +++++++
 kernel/fork.c                        | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
index b6ac056291d7..9c4a445bb43c 100644
--- a/include/linux/user-return-notifier.h
+++ b/include/linux/user-return-notifier.h
@@ -26,6 +26,11 @@ static inline void propagate_user_return_notify(struct task_struct *prev,
 
 void fire_user_return_notifiers(void);
 
+static inline void clear_user_return_notifier(struct task_struct *p)
+{
+	clear_tsk_thread_flag(p, TIF_USER_RETURN_NOTIFY);
+}
+
 #else
 
 struct user_return_notifier {};
@@ -37,6 +42,8 @@ static inline void propagate_user_return_notify(struct task_struct *prev,
 
 static inline void fire_user_return_notifiers(void) {}
 
+static inline void clear_user_return_notifier(struct task_struct *p) {}
+
 #endif
 
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 266c6af6ef1b..1b7512d5a64a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,7 @@
 #include <linux/magic.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		goto out;
 
 	setup_thread_stack(tsk, orig);
+	clear_user_return_notifier(tsk);
 	stackend = end_of_stack(tsk);
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 
-- 
cgit v1.2.3


From b3a8549593696f5f3efcdbf280e2c8e0fe894855 Mon Sep 17 00:00:00 2001
From: Haojian Zhuang <haojian.zhuang@marvell.com>
Date: Thu, 5 Nov 2009 10:27:13 -0500
Subject: backlight: da903x_bl: control WLED output current in da9034

Update WLED output current source before changing brightness.

Signed-off-by: Haojian Zhuang <haojian.zhuang@marvell.com>
Signed-off-by: Eric Miao <eric.y.miao@gmail.com>
---
 drivers/video/backlight/da903x_bl.c | 7 +++++++
 include/linux/mfd/da903x.h          | 4 ++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/da903x_bl.c b/drivers/video/backlight/da903x_bl.c
index 701a1081e199..7fcb0eb54c60 100644
--- a/drivers/video/backlight/da903x_bl.c
+++ b/drivers/video/backlight/da903x_bl.c
@@ -25,6 +25,7 @@
 
 #define DA9034_WLED_CONTROL1	0x3C
 #define DA9034_WLED_CONTROL2	0x3D
+#define DA9034_WLED_ISET(x)	((x) & 0x1f)
 
 #define DA9034_WLED_BOOST_EN	(1 << 5)
 
@@ -101,6 +102,7 @@ static struct backlight_ops da903x_backlight_ops = {
 
 static int da903x_backlight_probe(struct platform_device *pdev)
 {
+	struct da9034_backlight_pdata *pdata = pdev->dev.platform_data;
 	struct da903x_backlight_data *data;
 	struct backlight_device *bl;
 	int max_brightness;
@@ -127,6 +129,11 @@ static int da903x_backlight_probe(struct platform_device *pdev)
 	data->da903x_dev = pdev->dev.parent;
 	data->current_brightness = 0;
 
+	/* adjust the WLED output current */
+	if (pdata)
+		da903x_write(data->da903x_dev, DA9034_WLED_CONTROL2,
+				DA9034_WLED_ISET(pdata->output_current));
+
 	bl = backlight_device_register(pdev->name, data->da903x_dev,
 			data, &da903x_backlight_ops);
 	if (IS_ERR(bl)) {
diff --git a/include/linux/mfd/da903x.h b/include/linux/mfd/da903x.h
index c63b65c94429..0aa3a1a49ee3 100644
--- a/include/linux/mfd/da903x.h
+++ b/include/linux/mfd/da903x.h
@@ -96,6 +96,10 @@ struct da9034_touch_pdata {
 	int	y_inverted;
 };
 
+struct da9034_backlight_pdata {
+	int	output_current;	/* output current of WLED, from 0-31 (in mA) */
+};
+
 /* DA9030 battery charger data */
 struct power_supply_info;
 
-- 
cgit v1.2.3


From 7716977b6ae5a0cdd0afab5c6035c4d0ce53f599 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 30 Nov 2009 13:24:18 +0000
Subject: mfd: Correct WM831X_MAX_ISEL_VALUE

There was confusion between the array size and the highest ISEL
value possible.

Reported-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/wm831x-core.c            | 2 +-
 include/linux/mfd/wm831x/regulator.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index 49b7885c2702..7f27576ca046 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -29,7 +29,7 @@
 /* Current settings - values are 2*2^(reg_val/4) microamps.  These are
  * exported since they are used by multiple drivers.
  */
-int wm831x_isinkv_values[WM831X_ISINK_MAX_ISEL] = {
+int wm831x_isinkv_values[WM831X_ISINK_MAX_ISEL + 1] = {
 	2,
 	2,
 	3,
diff --git a/include/linux/mfd/wm831x/regulator.h b/include/linux/mfd/wm831x/regulator.h
index f95466343fb2..955d30fc6a27 100644
--- a/include/linux/mfd/wm831x/regulator.h
+++ b/include/linux/mfd/wm831x/regulator.h
@@ -1212,7 +1212,7 @@
 #define WM831X_LDO1_OK_SHIFT                         0  /* LDO1_OK */
 #define WM831X_LDO1_OK_WIDTH                         1  /* LDO1_OK */
 
-#define WM831X_ISINK_MAX_ISEL 56
-extern int wm831x_isinkv_values[WM831X_ISINK_MAX_ISEL];
+#define WM831X_ISINK_MAX_ISEL 55
+extern int wm831x_isinkv_values[WM831X_ISINK_MAX_ISEL + 1];
 
 #endif
-- 
cgit v1.2.3


From f13a48bd798a159291ca583b95453171b88b7448 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 1 Dec 2009 15:36:11 +0000
Subject: SLOW_WORK: Move slow_work's proc file to debugfs

Move slow_work's debugging proc file to debugfs.

Signed-off-by: David Howells <dhowells@redhat.com>
Requested-and-acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/slow-work.txt |   4 +-
 include/linux/slow-work.h   |   8 +-
 init/Kconfig                |   8 +-
 kernel/Makefile             |   2 +-
 kernel/slow-work-debugfs.c  | 227 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/slow-work-proc.c     | 227 --------------------------------------------
 kernel/slow-work.c          |  18 ++--
 kernel/slow-work.h          |   6 +-
 8 files changed, 253 insertions(+), 247 deletions(-)
 create mode 100644 kernel/slow-work-debugfs.c
 delete mode 100644 kernel/slow-work-proc.c

(limited to 'include/linux')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index 52bc31433723..9dbf4470c7e1 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -279,9 +279,9 @@ The slow-work thread pool has a number of configurables:
 VIEWING EXECUTING AND QUEUED ITEMS
 ==================================
 
-If CONFIG_SLOW_WORK_PROC is enabled, a proc file is made available:
+If CONFIG_SLOW_WORK_DEBUG is enabled, a debugfs file is made available:
 
-	/proc/slow_work_rq
+	/sys/kernel/debug/slow_work/runqueue
 
 through which the list of work items being executed and the queues of items to
 be executed may be viewed.  The owner of a work item is given the chance to
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index 5035a2691739..13337bf6c3f5 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -20,7 +20,7 @@
 #include <linux/timer.h>
 
 struct slow_work;
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 struct seq_file;
 #endif
 
@@ -42,8 +42,8 @@ struct slow_work_ops {
 	/* execute a work item */
 	void (*execute)(struct slow_work *work);
 
-#ifdef CONFIG_SLOW_WORK_PROC
-	/* describe a work item for /proc */
+#ifdef CONFIG_SLOW_WORK_DEBUG
+	/* describe a work item for debugfs */
 	void (*desc)(struct slow_work *work, struct seq_file *m);
 #endif
 };
@@ -64,7 +64,7 @@ struct slow_work {
 #define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	struct timespec		mark;	/* jiffies at which queued or exec begun */
 #endif
 };
diff --git a/init/Kconfig b/init/Kconfig
index ab5c64801fe5..39923ccc287b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1098,12 +1098,12 @@ config SLOW_WORK
 
 	  See Documentation/slow-work.txt.
 
-config SLOW_WORK_PROC
-	bool "Slow work debugging through /proc"
+config SLOW_WORK_DEBUG
+	bool "Slow work debugging through debugfs"
 	default n
-	depends on SLOW_WORK && PROC_FS
+	depends on SLOW_WORK && DEBUG_FS
 	help
-	  Display the contents of the slow work run queue through /proc,
+	  Display the contents of the slow work run queue through debugfs,
 	  including items currently executing.
 
 	  See Documentation/slow-work.txt.
diff --git a/kernel/Makefile b/kernel/Makefile
index 776ffed1556d..d7c13d249b2d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_SLOW_WORK_PROC) += slow-work-proc.o
+obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
new file mode 100644
index 000000000000..e45c43645298
--- /dev/null
+++ b/kernel/slow-work-debugfs.c
@@ -0,0 +1,227 @@
+/* Slow work debugging
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/seq_file.h>
+#include "slow-work.h"
+
+#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
+#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
+#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
+
+void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
+{
+	seq_puts(m, "Slow-work: New thread");
+}
+
+/*
+ * Render the time mark field on a work item into a 5-char time with units plus
+ * a space
+ */
+static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
+{
+	struct timespec now, diff;
+
+	now = CURRENT_TIME;
+	diff = timespec_sub(now, work->mark);
+
+	if (diff.tv_sec < 0)
+		seq_puts(m, "  -ve ");
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
+		seq_printf(m, "%3luns ", diff.tv_nsec);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
+		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
+		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
+	else if (diff.tv_sec <= 1)
+		seq_puts(m, "   1s ");
+	else if (diff.tv_sec < 60)
+		seq_printf(m, "%4lus ", diff.tv_sec);
+	else if (diff.tv_sec < 60 * 60)
+		seq_printf(m, "%4lum ", diff.tv_sec / 60);
+	else if (diff.tv_sec < 60 * 60 * 24)
+		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
+	else
+		seq_puts(m, "exces ");
+}
+
+/*
+ * Describe a slow work item for debugfs
+ */
+static int slow_work_runqueue_show(struct seq_file *m, void *v)
+{
+	struct slow_work *work;
+	struct list_head *p = v;
+	unsigned long id;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
+		return 0;
+	case 2:
+		seq_puts(m, "=== ===== ================ == ===== ==========\n");
+		return 0;
+
+	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
+		id = (unsigned long) v - 3;
+
+		read_lock(&slow_work_execs_lock);
+		work = slow_work_execs[id];
+		if (work) {
+			smp_read_barrier_depends();
+
+			seq_printf(m, "%3lu %5d %16p %2lx ",
+				   id, slow_work_pids[id], work, work->flags);
+			slow_work_print_mark(m, work);
+
+			if (work->ops->desc)
+				work->ops->desc(work, m);
+			seq_putc(m, '\n');
+		}
+		read_unlock(&slow_work_execs_lock);
+		return 0;
+
+	default:
+		work = list_entry(p, struct slow_work, link);
+		seq_printf(m, "%3s     - %16p %2lx ",
+			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
+			   work, work->flags);
+		slow_work_print_mark(m, work);
+
+		if (work->ops->desc)
+			work->ops->desc(work, m);
+		seq_putc(m, '\n');
+		return 0;
+	}
+}
+
+/*
+ * map the iterator to a work item
+ */
+static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
+{
+	struct list_head *p;
+	unsigned long count, id;
+
+	switch (*_pos >> ITERATOR_SHIFT) {
+	case 0x0:
+		if (*_pos == 0)
+			*_pos = 1;
+		if (*_pos < 3)
+			return (void *)(unsigned long) *_pos;
+		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
+			for (id = *_pos - 3;
+			     id < SLOW_WORK_THREAD_LIMIT;
+			     id++, (*_pos)++)
+				if (slow_work_execs[id])
+					return (void *)(unsigned long) *_pos;
+		*_pos = 0x1UL << ITERATOR_SHIFT;
+
+	case 0x1:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &slow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+
+	case 0x2:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &vslow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
+{
+	spin_lock_irq(&slow_work_queue_lock);
+	return slow_work_runqueue_index(m, _pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	struct list_head *p = v;
+	unsigned long selector = *_pos >> ITERATOR_SHIFT;
+
+	(*_pos)++;
+	switch (selector) {
+	case 0x0:
+		return slow_work_runqueue_index(m, _pos);
+
+	case 0x1:
+		if (*_pos >> ITERATOR_SHIFT == 0x1) {
+			p = p->next;
+			if (p != &slow_work_queue)
+				return p;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+		p = &vslow_work_queue;
+
+	case 0x2:
+		if (*_pos >> ITERATOR_SHIFT == 0x2) {
+			p = p->next;
+			if (p != &vslow_work_queue)
+				return p;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * clean up after reading
+ */
+static void slow_work_runqueue_stop(struct seq_file *m, void *v)
+{
+	spin_unlock_irq(&slow_work_queue_lock);
+}
+
+static const struct seq_operations slow_work_runqueue_ops = {
+	.start		= slow_work_runqueue_start,
+	.stop		= slow_work_runqueue_stop,
+	.next		= slow_work_runqueue_next,
+	.show		= slow_work_runqueue_show,
+};
+
+/*
+ * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
+ */
+static int slow_work_runqueue_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slow_work_runqueue_ops);
+}
+
+const struct file_operations slow_work_runqueue_fops = {
+	.owner		= THIS_MODULE,
+	.open		= slow_work_runqueue_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/kernel/slow-work-proc.c b/kernel/slow-work-proc.c
deleted file mode 100644
index 3988032571f5..000000000000
--- a/kernel/slow-work-proc.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Slow work debugging
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/slow-work.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/seq_file.h>
-#include "slow-work.h"
-
-#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
-#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
-#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
-
-void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
-{
-	seq_puts(m, "Slow-work: New thread");
-}
-
-/*
- * Render the time mark field on a work item into a 5-char time with units plus
- * a space
- */
-static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
-{
-	struct timespec now, diff;
-
-	now = CURRENT_TIME;
-	diff = timespec_sub(now, work->mark);
-
-	if (diff.tv_sec < 0)
-		seq_puts(m, "  -ve ");
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
-		seq_printf(m, "%3luns ", diff.tv_nsec);
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
-		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
-		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
-	else if (diff.tv_sec <= 1)
-		seq_puts(m, "   1s ");
-	else if (diff.tv_sec < 60)
-		seq_printf(m, "%4lus ", diff.tv_sec);
-	else if (diff.tv_sec < 60 * 60)
-		seq_printf(m, "%4lum ", diff.tv_sec / 60);
-	else if (diff.tv_sec < 60 * 60 * 24)
-		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
-	else
-		seq_puts(m, "exces ");
-}
-
-/*
- * Describe a slow work item for /proc
- */
-static int slow_work_runqueue_show(struct seq_file *m, void *v)
-{
-	struct slow_work *work;
-	struct list_head *p = v;
-	unsigned long id;
-
-	switch ((unsigned long) v) {
-	case 1:
-		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
-		return 0;
-	case 2:
-		seq_puts(m, "=== ===== ================ == ===== ==========\n");
-		return 0;
-
-	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
-		id = (unsigned long) v - 3;
-
-		read_lock(&slow_work_execs_lock);
-		work = slow_work_execs[id];
-		if (work) {
-			smp_read_barrier_depends();
-
-			seq_printf(m, "%3lu %5d %16p %2lx ",
-				   id, slow_work_pids[id], work, work->flags);
-			slow_work_print_mark(m, work);
-
-			if (work->ops->desc)
-				work->ops->desc(work, m);
-			seq_putc(m, '\n');
-		}
-		read_unlock(&slow_work_execs_lock);
-		return 0;
-
-	default:
-		work = list_entry(p, struct slow_work, link);
-		seq_printf(m, "%3s     - %16p %2lx ",
-			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
-			   work, work->flags);
-		slow_work_print_mark(m, work);
-
-		if (work->ops->desc)
-			work->ops->desc(work, m);
-		seq_putc(m, '\n');
-		return 0;
-	}
-}
-
-/*
- * map the iterator to a work item
- */
-static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
-{
-	struct list_head *p;
-	unsigned long count, id;
-
-	switch (*_pos >> ITERATOR_SHIFT) {
-	case 0x0:
-		if (*_pos == 0)
-			*_pos = 1;
-		if (*_pos < 3)
-			return (void *)(unsigned long) *_pos;
-		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
-			for (id = *_pos - 3;
-			     id < SLOW_WORK_THREAD_LIMIT;
-			     id++, (*_pos)++)
-				if (slow_work_execs[id])
-					return (void *)(unsigned long) *_pos;
-		*_pos = 0x1UL << ITERATOR_SHIFT;
-
-	case 0x1:
-		count = *_pos & ITERATOR_COUNTER;
-		list_for_each(p, &slow_work_queue) {
-			if (count == 0)
-				return p;
-			count--;
-		}
-		*_pos = 0x2UL << ITERATOR_SHIFT;
-
-	case 0x2:
-		count = *_pos & ITERATOR_COUNTER;
-		list_for_each(p, &vslow_work_queue) {
-			if (count == 0)
-				return p;
-			count--;
-		}
-		*_pos = 0x3UL << ITERATOR_SHIFT;
-
-	default:
-		return NULL;
-	}
-}
-
-/*
- * set up the iterator to start reading from the first line
- */
-static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
-{
-	spin_lock_irq(&slow_work_queue_lock);
-	return slow_work_runqueue_index(m, _pos);
-}
-
-/*
- * move to the next line
- */
-static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
-{
-	struct list_head *p = v;
-	unsigned long selector = *_pos >> ITERATOR_SHIFT;
-
-	(*_pos)++;
-	switch (selector) {
-	case 0x0:
-		return slow_work_runqueue_index(m, _pos);
-
-	case 0x1:
-		if (*_pos >> ITERATOR_SHIFT == 0x1) {
-			p = p->next;
-			if (p != &slow_work_queue)
-				return p;
-		}
-		*_pos = 0x2UL << ITERATOR_SHIFT;
-		p = &vslow_work_queue;
-
-	case 0x2:
-		if (*_pos >> ITERATOR_SHIFT == 0x2) {
-			p = p->next;
-			if (p != &vslow_work_queue)
-				return p;
-		}
-		*_pos = 0x3UL << ITERATOR_SHIFT;
-
-	default:
-		return NULL;
-	}
-}
-
-/*
- * clean up after reading
- */
-static void slow_work_runqueue_stop(struct seq_file *m, void *v)
-{
-	spin_unlock_irq(&slow_work_queue_lock);
-}
-
-static const struct seq_operations slow_work_runqueue_ops = {
-	.start		= slow_work_runqueue_start,
-	.stop		= slow_work_runqueue_stop,
-	.next		= slow_work_runqueue_next,
-	.show		= slow_work_runqueue_show,
-};
-
-/*
- * open "/proc/slow_work_rq" to list queue contents
- */
-static int slow_work_runqueue_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &slow_work_runqueue_ops);
-}
-
-const struct file_operations slow_work_runqueue_fops = {
-	.owner		= THIS_MODULE,
-	.open		= slow_work_runqueue_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b5c17f15f9de..00889bd3c590 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,7 +16,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/wait.h>
-#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include "slow-work.h"
 
 static void slow_work_cull_timeout(unsigned long);
@@ -138,7 +138,7 @@ static void slow_work_clear_thread_processing(int id) {}
 /*
  * Data for tracking currently executing items for indication through /proc
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
 pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
 DEFINE_RWLOCK(slow_work_execs_lock);
@@ -823,7 +823,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 static const struct slow_work_ops slow_work_new_thread_ops = {
 	.owner		= THIS_MODULE,
 	.execute	= slow_work_new_thread_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc		= slow_work_new_thread_desc,
 #endif
 };
@@ -1055,9 +1055,15 @@ static int __init init_slow_work(void)
 	if (slow_work_max_max_threads < nr_cpus * 2)
 		slow_work_max_max_threads = nr_cpus * 2;
 #endif
-#ifdef CONFIG_SLOW_WORK_PROC
-	proc_create("slow_work_rq", S_IFREG | 0400, NULL,
-		    &slow_work_runqueue_fops);
+#ifdef CONFIG_SLOW_WORK_DEBUG
+	{
+		struct dentry *dbdir;
+
+		dbdir = debugfs_create_dir("slow_work", NULL);
+		if (dbdir && !IS_ERR(dbdir))
+			debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
+					    NULL, &slow_work_runqueue_fops);
+	}
 #endif
 	return 0;
 }
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 3c2f007f3ad6..321f3c59d732 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -19,7 +19,7 @@
 /*
  * slow-work.c
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 extern struct slow_work *slow_work_execs[];
 extern pid_t slow_work_pids[];
 extern rwlock_t slow_work_execs_lock;
@@ -30,9 +30,9 @@ extern struct list_head vslow_work_queue;
 extern spinlock_t slow_work_queue_lock;
 
 /*
- * slow-work-proc.c
+ * slow-work-debugfs.c
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 extern const struct file_operations slow_work_runqueue_fops;
 
 extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
-- 
cgit v1.2.3


From bf56a4ea9f1683c5b223fd3a5dbea23f1fa91c34 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:20 +0800
Subject: trace_syscalls: Remove unused event_syscall_enter and
 event_syscall_exit

fix event_enter_##sname->event
fix event_exit_##sname->event

remove unused event_syscall_enter and event_syscall_exit

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D278.4090209@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      | 4 ++--
 include/trace/syscall.h       | 2 --
 kernel/trace/trace_syscalls.c | 8 --------
 3 files changed, 2 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b50974a93af0..2f7c539ab96d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -178,7 +178,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	  event_enter_##sname = {					\
 		.name                   = "sys_enter"#sname,		\
 		.system                 = "syscalls",			\
-		.event                  = &event_syscall_enter,		\
+		.event                  = &enter_syscall_print_##sname,	\
 		.raw_init		= init_enter_##sname,		\
 		.show_format		= syscall_enter_format,		\
 		.define_fields		= syscall_enter_define_fields,	\
@@ -214,7 +214,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	  event_exit_##sname = {					\
 		.name                   = "sys_exit"#sname,		\
 		.system                 = "syscalls",			\
-		.event                  = &event_syscall_exit,		\
+		.event                  = &exit_syscall_print_##sname,	\
 		.raw_init		= init_exit_##sname,		\
 		.show_format		= syscall_exit_format,		\
 		.define_fields		= syscall_exit_define_fields,	\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 51ee17d3632a..5f8827c92db7 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -37,8 +37,6 @@ extern unsigned long arch_syscall_addr(int nr);
 extern int syscall_name_to_nr(char *name);
 void set_syscall_enter_id(int num, int id);
 void set_syscall_exit_id(int num, int id);
-extern struct trace_event event_syscall_enter;
-extern struct trace_event event_syscall_exit;
 
 extern int syscall_enter_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 63aa8070365d..00d6e176f5b6 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -444,14 +444,6 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
-struct trace_event event_syscall_enter = {
-	.trace			= print_syscall_enter,
-};
-
-struct trace_event event_syscall_exit = {
-	.trace			= print_syscall_exit,
-};
-
 int __init init_ftrace_syscalls(void)
 {
 	struct syscall_metadata *meta;
-- 
cgit v1.2.3


From 31c16b13349970b2684248c7d8608d2a96ae135d Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:30 +0800
Subject: trace_syscalls: Set event_enter_##sname->data to its metadata

Set event_enter_##sname->data to its metadata,
it makes codes simpler.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D282.7050709@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      |  6 ++++--
 include/trace/syscall.h       |  2 +-
 kernel/trace/trace_syscalls.c | 36 +++++++++++-------------------------
 3 files changed, 16 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2f7c539ab96d..d3c9fd01a110 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -153,6 +153,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 #define __SC_STR_TDECL6(t, a, ...)	#t, __SC_STR_TDECL5(__VA_ARGS__)
 
 #define SYSCALL_TRACE_ENTER_EVENT(sname)				\
+	static const struct syscall_metadata __syscall_meta_##sname;	\
 	static struct ftrace_event_call event_enter_##sname;		\
 	struct trace_event enter_syscall_print_##sname = {		\
 		.trace                  = print_syscall_enter,		\
@@ -184,11 +185,12 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		.define_fields		= syscall_enter_define_fields,	\
 		.regfunc		= reg_event_syscall_enter,	\
 		.unregfunc		= unreg_event_syscall_enter,	\
-		.data			= "sys"#sname,			\
+		.data			= (void *)&__syscall_meta_##sname,\
 		TRACE_SYS_ENTER_PROFILE_INIT(sname)			\
 	}
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
+	static const struct syscall_metadata __syscall_meta_##sname;	\
 	static struct ftrace_event_call event_exit_##sname;		\
 	struct trace_event exit_syscall_print_##sname = {		\
 		.trace                  = print_syscall_exit,		\
@@ -220,7 +222,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		.define_fields		= syscall_exit_define_fields,	\
 		.regfunc		= reg_event_syscall_exit,	\
 		.unregfunc		= unreg_event_syscall_exit,	\
-		.data			= "sys"#sname,			\
+		.data			= (void *)&__syscall_meta_##sname,\
 		TRACE_SYS_EXIT_PROFILE_INIT(sname)			\
 	}
 
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 5f8827c92db7..c5265c81c4e7 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -34,7 +34,7 @@ struct syscall_metadata {
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 extern unsigned long arch_syscall_addr(int nr);
-extern int syscall_name_to_nr(char *name);
+extern int syscall_name_to_nr(const char *name);
 void set_syscall_enter_id(int num, int id);
 void set_syscall_exit_id(int num, int id);
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 00d6e176f5b6..39649b1675dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -51,7 +51,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
 	return syscalls_metadata[nr];
 }
 
-int syscall_name_to_nr(char *name)
+int syscall_name_to_nr(const char *name)
 {
 	int i;
 
@@ -172,18 +172,11 @@ extern char *__bad_type_size(void);
 int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
 {
 	int i;
-	int nr;
 	int ret;
-	struct syscall_metadata *entry;
+	struct syscall_metadata *entry = call->data;
 	struct syscall_trace_enter trace;
 	int offset = offsetof(struct syscall_trace_enter, args);
 
-	nr = syscall_name_to_nr(call->data);
-	entry = syscall_nr_to_meta(nr);
-
-	if (!entry)
-		return 0;
-
 	ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
 			       "\tsigned:%u;\n",
 			       SYSCALL_FIELD(int, nr));
@@ -245,18 +238,11 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
 int syscall_enter_define_fields(struct ftrace_event_call *call)
 {
 	struct syscall_trace_enter trace;
-	struct syscall_metadata *meta;
+	struct syscall_metadata *meta = call->data;
 	int ret;
-	int nr;
 	int i;
 	int offset = offsetof(typeof(trace), args);
 
-	nr = syscall_name_to_nr(call->data);
-	meta = syscall_nr_to_meta(nr);
-
-	if (!meta)
-		return 0;
-
 	ret = trace_define_common_fields(call);
 	if (ret)
 		return ret;
@@ -366,9 +352,9 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
-	char *name;
+	const char *name;
 
-	name = (char *)call->data;
+	name = ((struct syscall_metadata *)call->data)->name;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= NR_syscalls)
 		return -ENOSYS;
@@ -389,9 +375,9 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 void unreg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int num;
-	char *name;
+	const char *name;
 
-	name = (char *)call->data;
+	name = ((struct syscall_metadata *)call->data)->name;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= NR_syscalls)
 		return;
@@ -407,9 +393,9 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
-	char *name;
+	const char *name;
 
-	name = call->data;
+	name = ((struct syscall_metadata *)call->data)->name;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= NR_syscalls)
 		return -ENOSYS;
@@ -430,9 +416,9 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 void unreg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int num;
-	char *name;
+	const char *name;
 
-	name = call->data;
+	name = ((struct syscall_metadata *)call->data)->name;
 	num = syscall_name_to_nr(name);
 	if (num < 0 || num >= NR_syscalls)
 		return;
-- 
cgit v1.2.3


From fcc19438dda38dacc8c144e2db3ebc6b9fd4f8b8 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:36 +0800
Subject: trace_syscalls: Remove enter_id exit_id

use ->enter_event->id instead of ->enter_id
use ->exit_event->id instead of ->exit_id

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D288.7030001@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      |  2 --
 include/trace/syscall.h       |  6 ------
 kernel/trace/trace_syscalls.c | 30 ++++++++++--------------------
 3 files changed, 10 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d3c9fd01a110..b9af87560adb 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -168,7 +168,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		if (!id)						\
 			return -ENODEV;					\
 		event_enter_##sname.id = id;				\
-		set_syscall_enter_id(num, id);				\
 		INIT_LIST_HEAD(&event_enter_##sname.fields);		\
 		return 0;						\
 	}								\
@@ -205,7 +204,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		if (!id)						\
 			return -ENODEV;					\
 		event_exit_##sname.id = id;				\
-		set_syscall_exit_id(num, id);				\
 		INIT_LIST_HEAD(&event_exit_##sname.fields);		\
 		return 0;						\
 	}								\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index c5265c81c4e7..ca09561cd578 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -15,8 +15,6 @@
  * @nb_args: number of parameters it takes
  * @types: list of types as strings
  * @args: list of args as strings (args[i] matches types[i])
- * @enter_id: associated ftrace enter event id
- * @exit_id: associated ftrace exit event id
  * @enter_event: associated syscall_enter trace event
  * @exit_event: associated syscall_exit trace event
  */
@@ -25,8 +23,6 @@ struct syscall_metadata {
 	int		nb_args;
 	const char	**types;
 	const char	**args;
-	int		enter_id;
-	int		exit_id;
 
 	struct ftrace_event_call *enter_event;
 	struct ftrace_event_call *exit_event;
@@ -35,8 +31,6 @@ struct syscall_metadata {
 #ifdef CONFIG_FTRACE_SYSCALLS
 extern unsigned long arch_syscall_addr(int nr);
 extern int syscall_name_to_nr(const char *name);
-void set_syscall_enter_id(int num, int id);
-void set_syscall_exit_id(int num, int id);
 
 extern int syscall_enter_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 39649b1675dd..27eb18d69222 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -67,16 +67,6 @@ int syscall_name_to_nr(const char *name)
 	return -1;
 }
 
-void set_syscall_enter_id(int num, int id)
-{
-	syscalls_metadata[num]->enter_id = id;
-}
-
-void set_syscall_exit_id(int num, int id)
-{
-	syscalls_metadata[num]->exit_id = id;
-}
-
 enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags)
 {
@@ -93,7 +83,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
 	if (!entry)
 		goto end;
 
-	if (entry->enter_id != ent->type) {
+	if (entry->enter_event->id != ent->type) {
 		WARN_ON_ONCE(1);
 		goto end;
 	}
@@ -148,7 +138,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
 		return TRACE_TYPE_HANDLED;
 	}
 
-	if (entry->exit_id != ent->type) {
+	if (entry->exit_event->id != ent->type) {
 		WARN_ON_ONCE(1);
 		return TRACE_TYPE_UNHANDLED;
 	}
@@ -302,8 +292,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
 
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
-	event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
-						  size, 0, 0);
+	event = trace_current_buffer_lock_reserve(&buffer,
+			sys_data->enter_event->id, size, 0, 0);
 	if (!event)
 		return;
 
@@ -334,8 +324,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
-				sizeof(*entry), 0, 0);
+	event = trace_current_buffer_lock_reserve(&buffer,
+			sys_data->exit_event->id, sizeof(*entry), 0, 0);
 	if (!event)
 		return;
 
@@ -510,11 +500,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 
 	rec = (struct syscall_trace_enter *) raw_data;
 	tracing_generic_entry_update(&rec->ent, 0, 0);
-	rec->ent.type = sys_data->enter_id;
+	rec->ent.type = sys_data->enter_event->id;
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
+	perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
 
 end:
 	perf_swevent_put_recursion_context(rctx);
@@ -615,11 +605,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	rec = (struct syscall_trace_exit *)raw_data;
 
 	tracing_generic_entry_update(&rec->ent, 0, 0);
-	rec->ent.type = sys_data->exit_id;
+	rec->ent.type = sys_data->exit_event->id;
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
+	perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
 
 end:
 	perf_swevent_put_recursion_context(rctx);
-- 
cgit v1.2.3


From c252f65793874b56d50395ab604db465ce688665 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:47 +0800
Subject: trace_syscalls: Add syscall_nr field to struct syscall_metadata

Add syscall_nr field to struct syscall_metadata,
it helps us to get syscall number easier.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D293.6090800@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      |  4 ++--
 include/trace/syscall.h       |  3 ++-
 kernel/trace/trace_syscalls.c | 22 +++++++++-------------
 3 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b9af87560adb..3c280d7ecb76 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -161,7 +161,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	static int init_enter_##sname(struct ftrace_event_call *call)	\
 	{								\
 		int num, id;						\
-		num = syscall_name_to_nr("sys"#sname);			\
+		num = __syscall_meta_##sname.syscall_nr;		\
 		if (num < 0)						\
 			return -ENOSYS;					\
 		id = register_ftrace_event(&enter_syscall_print_##sname);\
@@ -197,7 +197,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	static int init_exit_##sname(struct ftrace_event_call *call)	\
 	{								\
 		int num, id;						\
-		num = syscall_name_to_nr("sys"#sname);			\
+		num = __syscall_meta_##sname.syscall_nr;		\
 		if (num < 0)						\
 			return -ENOSYS;					\
 		id = register_ftrace_event(&exit_syscall_print_##sname);\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index ca09561cd578..1531eef3071f 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -12,6 +12,7 @@
  * A syscall entry in the ftrace syscalls array.
  *
  * @name: name of the syscall
+ * @syscall_nr: number of the syscall
  * @nb_args: number of parameters it takes
  * @types: list of types as strings
  * @args: list of args as strings (args[i] matches types[i])
@@ -20,6 +21,7 @@
  */
 struct syscall_metadata {
 	const char	*name;
+	int		syscall_nr;
 	int		nb_args;
 	const char	**types;
 	const char	**args;
@@ -30,7 +32,6 @@ struct syscall_metadata {
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 extern unsigned long arch_syscall_addr(int nr);
-extern int syscall_name_to_nr(const char *name);
 
 extern int syscall_enter_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 27eb18d69222..144cc14d8551 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -51,7 +51,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
 	return syscalls_metadata[nr];
 }
 
-int syscall_name_to_nr(const char *name)
+static int syscall_name_to_nr(const char *name)
 {
 	int i;
 
@@ -342,10 +342,8 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
-	const char *name;
 
-	name = ((struct syscall_metadata *)call->data)->name;
-	num = syscall_name_to_nr(name);
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (num < 0 || num >= NR_syscalls)
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
@@ -365,10 +363,8 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
 void unreg_event_syscall_enter(struct ftrace_event_call *call)
 {
 	int num;
-	const char *name;
 
-	name = ((struct syscall_metadata *)call->data)->name;
-	num = syscall_name_to_nr(name);
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (num < 0 || num >= NR_syscalls)
 		return;
 	mutex_lock(&syscall_trace_lock);
@@ -383,10 +379,8 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
-	const char *name;
 
-	name = ((struct syscall_metadata *)call->data)->name;
-	num = syscall_name_to_nr(name);
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (num < 0 || num >= NR_syscalls)
 		return -ENOSYS;
 	mutex_lock(&syscall_trace_lock);
@@ -406,10 +400,8 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
 void unreg_event_syscall_exit(struct ftrace_event_call *call)
 {
 	int num;
-	const char *name;
 
-	name = ((struct syscall_metadata *)call->data)->name;
-	num = syscall_name_to_nr(name);
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 	if (num < 0 || num >= NR_syscalls)
 		return;
 	mutex_lock(&syscall_trace_lock);
@@ -436,6 +428,10 @@ int __init init_ftrace_syscalls(void)
 	for (i = 0; i < NR_syscalls; i++) {
 		addr = arch_syscall_addr(i);
 		meta = find_syscall_meta(addr);
+		if (!meta)
+			continue;
+
+		meta->syscall_nr = i;
 		syscalls_metadata[i] = meta;
 	}
 
-- 
cgit v1.2.3


From a1301da0997bf73c44dbe584e9070a13adc89672 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:23:55 +0800
Subject: trace_syscalls: Remove duplicate init_enter_##sname()

use only one init_syscall_trace instead of
many init_enter_##sname()/init_exit_##sname()

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D29B.6090708@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      | 30 ++----------------------------
 include/trace/syscall.h       |  1 +
 kernel/trace/trace_syscalls.c | 12 ++++++++++++
 3 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3c280d7ecb76..cf0d923ea40e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -158,19 +158,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	struct trace_event enter_syscall_print_##sname = {		\
 		.trace                  = print_syscall_enter,		\
 	};								\
-	static int init_enter_##sname(struct ftrace_event_call *call)	\
-	{								\
-		int num, id;						\
-		num = __syscall_meta_##sname.syscall_nr;		\
-		if (num < 0)						\
-			return -ENOSYS;					\
-		id = register_ftrace_event(&enter_syscall_print_##sname);\
-		if (!id)						\
-			return -ENODEV;					\
-		event_enter_##sname.id = id;				\
-		INIT_LIST_HEAD(&event_enter_##sname.fields);		\
-		return 0;						\
-	}								\
 	TRACE_SYS_ENTER_PROFILE(sname);					\
 	static struct ftrace_event_call __used				\
 	  __attribute__((__aligned__(4)))				\
@@ -179,7 +166,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		.name                   = "sys_enter"#sname,		\
 		.system                 = "syscalls",			\
 		.event                  = &enter_syscall_print_##sname,	\
-		.raw_init		= init_enter_##sname,		\
+		.raw_init		= init_syscall_trace,		\
 		.show_format		= syscall_enter_format,		\
 		.define_fields		= syscall_enter_define_fields,	\
 		.regfunc		= reg_event_syscall_enter,	\
@@ -194,19 +181,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	struct trace_event exit_syscall_print_##sname = {		\
 		.trace                  = print_syscall_exit,		\
 	};								\
-	static int init_exit_##sname(struct ftrace_event_call *call)	\
-	{								\
-		int num, id;						\
-		num = __syscall_meta_##sname.syscall_nr;		\
-		if (num < 0)						\
-			return -ENOSYS;					\
-		id = register_ftrace_event(&exit_syscall_print_##sname);\
-		if (!id)						\
-			return -ENODEV;					\
-		event_exit_##sname.id = id;				\
-		INIT_LIST_HEAD(&event_exit_##sname.fields);		\
-		return 0;						\
-	}								\
 	TRACE_SYS_EXIT_PROFILE(sname);					\
 	static struct ftrace_event_call __used				\
 	  __attribute__((__aligned__(4)))				\
@@ -215,7 +189,7 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 		.name                   = "sys_exit"#sname,		\
 		.system                 = "syscalls",			\
 		.event                  = &exit_syscall_print_##sname,	\
-		.raw_init		= init_exit_##sname,		\
+		.raw_init		= init_syscall_trace,		\
 		.show_format		= syscall_exit_format,		\
 		.define_fields		= syscall_exit_define_fields,	\
 		.regfunc		= reg_event_syscall_exit,	\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 1531eef3071f..dff9371e5274 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -32,6 +32,7 @@ struct syscall_metadata {
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 extern unsigned long arch_syscall_addr(int nr);
+extern int init_syscall_trace(struct ftrace_event_call *call);
 
 extern int syscall_enter_format(struct ftrace_event_call *call,
 				struct trace_seq *s);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 144cc14d8551..c6514093c95a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -412,6 +412,18 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
+int init_syscall_trace(struct ftrace_event_call *call)
+{
+	int id;
+
+	id = register_ftrace_event(call->event);
+	if (!id)
+		return -ENODEV;
+	call->id = id;
+	INIT_LIST_HEAD(&call->fields);
+	return 0;
+}
+
 int __init init_ftrace_syscalls(void)
 {
 	struct syscall_metadata *meta;
-- 
cgit v1.2.3


From 3bbe84e9d385205d638035ee9dcc4db1b486ea08 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 1 Dec 2009 16:24:01 +0800
Subject: trace_syscalls: Simplify syscall profile

use only one prof_sysenter_enable() instead of
prof_sysenter_enable_##sname()

use only one prof_sysenter_disable() instead of
prof_sysenter_disable_##sname()

use only one prof_sysexit_enable() instead of
prof_sysexit_enable_##sname()

use only one prof_sysexit_disable() instead of
prof_sysexit_disable_##sname()

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4B14D2A1.8060304@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h      | 31 ++++---------------------------
 include/trace/syscall.h       |  8 ++++----
 kernel/trace/trace_syscalls.c | 24 ++++++++----------------
 3 files changed, 16 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cf0d923ea40e..c2df3a593236 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -99,37 +99,16 @@ struct perf_event_attr;
 #define __SC_TEST6(t6, a6, ...)	__SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
 
 #ifdef CONFIG_EVENT_PROFILE
-#define TRACE_SYS_ENTER_PROFILE(sname)					       \
-static int prof_sysenter_enable_##sname(struct ftrace_event_call *unused)      \
-{									       \
-	return reg_prof_syscall_enter("sys"#sname);			       \
-}									       \
-									       \
-static void prof_sysenter_disable_##sname(struct ftrace_event_call *unused)    \
-{									       \
-	unreg_prof_syscall_enter("sys"#sname);				       \
-}
-
-#define TRACE_SYS_EXIT_PROFILE(sname)					       \
-static int prof_sysexit_enable_##sname(struct ftrace_event_call *unused)       \
-{									       \
-	return reg_prof_syscall_exit("sys"#sname);			       \
-}									       \
-									       \
-static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
-{                                                                              \
-	unreg_prof_syscall_exit("sys"#sname);				       \
-}
 
 #define TRACE_SYS_ENTER_PROFILE_INIT(sname)				       \
 	.profile_count = ATOMIC_INIT(-1),				       \
-	.profile_enable = prof_sysenter_enable_##sname,			       \
-	.profile_disable = prof_sysenter_disable_##sname,
+	.profile_enable = prof_sysenter_enable,				       \
+	.profile_disable = prof_sysenter_disable,
 
 #define TRACE_SYS_EXIT_PROFILE_INIT(sname)				       \
 	.profile_count = ATOMIC_INIT(-1),				       \
-	.profile_enable = prof_sysexit_enable_##sname,			       \
-	.profile_disable = prof_sysexit_disable_##sname,
+	.profile_enable = prof_sysexit_enable,				       \
+	.profile_disable = prof_sysexit_disable,
 #else
 #define TRACE_SYS_ENTER_PROFILE(sname)
 #define TRACE_SYS_ENTER_PROFILE_INIT(sname)
@@ -158,7 +137,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	struct trace_event enter_syscall_print_##sname = {		\
 		.trace                  = print_syscall_enter,		\
 	};								\
-	TRACE_SYS_ENTER_PROFILE(sname);					\
 	static struct ftrace_event_call __used				\
 	  __attribute__((__aligned__(4)))				\
 	  __attribute__((section("_ftrace_events")))			\
@@ -181,7 +159,6 @@ static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 	struct trace_event exit_syscall_print_##sname = {		\
 		.trace                  = print_syscall_exit,		\
 	};								\
-	TRACE_SYS_EXIT_PROFILE(sname);					\
 	static struct ftrace_event_call __used				\
 	  __attribute__((__aligned__(4)))				\
 	  __attribute__((section("_ftrace_events")))			\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index dff9371e5274..961fda3556bb 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -50,10 +50,10 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
 #endif
 #ifdef CONFIG_EVENT_PROFILE
-int reg_prof_syscall_enter(char *name);
-void unreg_prof_syscall_enter(char *name);
-int reg_prof_syscall_exit(char *name);
-void unreg_prof_syscall_exit(char *name);
+int prof_sysenter_enable(struct ftrace_event_call *call);
+void prof_sysenter_disable(struct ftrace_event_call *call);
+int prof_sysexit_enable(struct ftrace_event_call *call);
+void prof_sysexit_disable(struct ftrace_event_call *call);
 
 #endif
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6514093c95a..1e85b6cc26aa 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -520,14 +520,12 @@ end_recursion:
 	local_irq_restore(flags);
 }
 
-int reg_prof_syscall_enter(char *name)
+int prof_sysenter_enable(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
 
-	num = syscall_name_to_nr(name);
-	if (num < 0 || num >= NR_syscalls)
-		return -ENOSYS;
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_prof_refcount_enter)
@@ -543,13 +541,11 @@ int reg_prof_syscall_enter(char *name)
 	return ret;
 }
 
-void unreg_prof_syscall_enter(char *name)
+void prof_sysenter_disable(struct ftrace_event_call *call)
 {
 	int num;
 
-	num = syscall_name_to_nr(name);
-	if (num < 0 || num >= NR_syscalls)
-		return;
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
 	sys_prof_refcount_enter--;
@@ -625,14 +621,12 @@ end_recursion:
 	local_irq_restore(flags);
 }
 
-int reg_prof_syscall_exit(char *name)
+int prof_sysexit_enable(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
 
-	num = syscall_name_to_nr(name);
-	if (num < 0 || num >= NR_syscalls)
-		return -ENOSYS;
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
 	if (!sys_prof_refcount_exit)
@@ -648,13 +642,11 @@ int reg_prof_syscall_exit(char *name)
 	return ret;
 }
 
-void unreg_prof_syscall_exit(char *name)
+void prof_sysexit_disable(struct ftrace_event_call *call)
 {
 	int num;
 
-	num = syscall_name_to_nr(name);
-	if (num < 0 || num >= NR_syscalls)
-		return;
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
 	sys_prof_refcount_exit--;
-- 
cgit v1.2.3


From a5ee155136b4a8f4ab0e4c9c064b661da475e298 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 29 Nov 2009 15:45:58 +0000
Subject: net: NETDEV_UNREGISTER_PERNET -> NETDEV_UNREGISTER_BATCH

The motivation for an additional notifier in batched netdevice
notification (rt_do_flush) only needs to be called once per batch not
once per namespace.

For further batching improvements I need a guarantee that the
netdevices are unregistered in order allowing me to unregister an all
of the network devices in a network namespace at the same time with
the guarantee that the loopback device is really and truly
unregistered last.

Additionally it appears that we moved the route cache flush after
the final synchronize_net, which seems wrong and there was no
explanation.  So I have restored the original location of the final
synchronize_net.

Cc: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/notifier.h |  2 +-
 include/net/route.h      |  1 +
 net/core/dev.c           | 36 +++++++++---------------------------
 net/ipv4/fib_frontend.c  |  4 +++-
 net/ipv4/route.c         |  6 ++++++
 5 files changed, 20 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index b0c3671d463c..fee6c2f68075 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -202,7 +202,7 @@ static inline int notifier_to_errno(int ret)
 #define NETDEV_BONDING_OLDTYPE  0x000E
 #define NETDEV_BONDING_NEWTYPE  0x000F
 #define NETDEV_POST_INIT	0x0010
-#define NETDEV_UNREGISTER_PERNET 0x0011
+#define NETDEV_UNREGISTER_BATCH 0x0011
 
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN
diff --git a/include/net/route.h b/include/net/route.h
index cfb4c071a136..bce6dd68d27b 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -108,6 +108,7 @@ extern int		ip_rt_init(void);
 extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
 				       __be32 src, struct net_device *dev);
 extern void		rt_cache_flush(struct net *net, int how);
+extern void		rt_cache_flush_batch(void);
 extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
 extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
diff --git a/net/core/dev.c b/net/core/dev.c
index 5d131c2f84cc..bb37ee1e0901 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1353,7 +1353,7 @@ rollback:
 				nb->notifier_call(nb, NETDEV_DOWN, dev);
 			}
 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
-			nb->notifier_call(nb, NETDEV_UNREGISTER_PERNET, dev);
+			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
 		}
 	}
 
@@ -4771,8 +4771,7 @@ static void net_set_todo(struct net_device *dev)
 
 static void rollback_registered_many(struct list_head *head)
 {
-	struct net_device *dev, *aux, *fdev;
-	LIST_HEAD(pernet_list);
+	struct net_device *dev;
 
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
@@ -4828,26 +4827,14 @@ static void rollback_registered_many(struct list_head *head)
 		netdev_unregister_kobject(dev);
 	}
 
-	synchronize_net();
+	/* Process any work delayed until the end of the batch */
+	dev = list_entry(head->next, struct net_device, unreg_list);
+	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
-	list_for_each_entry_safe(dev, aux, head, unreg_list) {
-		int new_net = 1;
-		list_for_each_entry(fdev, &pernet_list, unreg_list) {
-			if (net_eq(dev_net(dev), dev_net(fdev))) {
-				new_net = 0;
-				dev_put(dev);
-				break;
-			}
-		}
-		if (new_net)
-			list_move(&dev->unreg_list, &pernet_list);
-	}
+	synchronize_net();
 
-	list_for_each_entry_safe(dev, aux, &pernet_list, unreg_list) {
-		call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev);
-		list_move(&dev->unreg_list, head);
+	list_for_each_entry(dev, head, unreg_list)
 		dev_put(dev);
-	}
 }
 
 static void rollback_registered(struct net_device *dev)
@@ -5129,7 +5116,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 			/* Rebroadcast unregister notification */
 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
-			/* don't resend NETDEV_UNREGISTER_PERNET, _PERNET users
+			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
 			 * should have already handle it the first time */
 
 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
@@ -5442,11 +5429,6 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
 /**
  *	unregister_netdevice_many - unregister many devices
  *	@head: list of devices
- *
- *	WARNING: Calling this modifies the given list
- *	(in rollback_registered_many). It may change the order of the elements
- *	in the list. However, you can assume it does not add or delete elements
- *	to/from the list.
  */
 void unregister_netdevice_many(struct list_head *head)
 {
@@ -5555,7 +5537,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	   this device. They should clean all the things.
 	*/
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
-	call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev);
+	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
 	/*
 	 *	Flush the unicast and multicast chains
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6c1e56aef1f4..3b373a8b0473 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -959,9 +959,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
-	case NETDEV_UNREGISTER_PERNET:
 		rt_cache_flush(dev_net(dev), 0);
 		break;
+	case NETDEV_UNREGISTER_BATCH:
+		rt_cache_flush_batch();
+		break;
 	}
 	return NOTIFY_DONE;
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9889fbd96487..90cdcfc32937 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -900,6 +900,12 @@ void rt_cache_flush(struct net *net, int delay)
 		rt_do_flush(!in_softirq());
 }
 
+/* Flush previous cache invalidated entries from the cache */
+void rt_cache_flush_batch(void)
+{
+	rt_do_flush(!in_softirq());
+}
+
 /*
  * We change rt_genid and let gc do the cleanup
  */
-- 
cgit v1.2.3


From dcbccbd4f1f6ad0f0e169d4b2e816e42bde06f82 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 29 Nov 2009 22:25:26 +0000
Subject: net: Implement for_each_netdev_reverse.

I will need this shortly to implement network namespace shutdown
batching.  For sanity sake network devices should be removed in
the reverse order they were created in.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9428793775a0..daf13d367498 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1112,6 +1112,8 @@ extern rwlock_t				dev_base_lock;		/* Device list lock */
 
 #define for_each_netdev(net, d)		\
 		list_for_each_entry(d, &(net)->dev_base_head, dev_list)
+#define for_each_netdev_reverse(net, d)	\
+		list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_rcu(net, d)		\
 		list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_safe(net, d, n)	\
-- 
cgit v1.2.3


From 8592e6486a177a02f048567cb928bc3a1f9a86c3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 2 Dec 2009 12:56:46 +0900
Subject: sched: Revert 498657a478c60be092208422fefa9c7b248729c2

498657a478c60be092208422fefa9c7b248729c2 incorrectly assumed
that preempt wasn't disabled around context_switch() and thus
was fixing imaginary problem.  It also broke KVM because it
depended on ->sched_in() to be called with irq enabled so that
it can do smp calls from there.

Revert the incorrect commit and add comment describing different
contexts under with the two callbacks are invoked.

Avi: spotted transposed in/out in the added comment.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Avi Kivity <avi@redhat.com>
Cc: peterz@infradead.org
Cc: efault@gmx.de
Cc: rusty@rustcorp.com.au
LKML-Reference: <1259726212-30259-2-git-send-email-tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/preempt.h | 5 +++++
 kernel/sched.c          | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 72b1a10a59b6..2e681d9555bd 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -105,6 +105,11 @@ struct preempt_notifier;
  * @sched_out: we've just been preempted
  *    notifier: struct preempt_notifier for the task being preempted
  *    next: the task that's kicking us out
+ *
+ * Please note that sched_in and out are called under different
+ * contexts.  sched_out is called with rq lock held and irq disabled
+ * while sched_in is called without rq lock and irq enabled.  This
+ * difference is intentional and depended upon by its users.
  */
 struct preempt_ops {
 	void (*sched_in)(struct preempt_notifier *notifier, int cpu);
diff --git a/kernel/sched.c b/kernel/sched.c
index b3d4e2be95aa..1031cae39c4c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2768,9 +2768,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	perf_event_task_sched_in(current, cpu_of(rq));
-	fire_sched_in_preempt_notifiers(current);
 	finish_lock_switch(rq, prev);
 
+	fire_sched_in_preempt_notifiers(current);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_state == TASK_DEAD)) {
-- 
cgit v1.2.3


From 6b62fe019e39edfd1dbe3f224ecd0a87d9365223 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 2 Dec 2009 07:23:10 +0100
Subject: tracing/syscalls: Make syscall events print callbacks static

enter_syscall_print_##sname and exit_syscall_print_##sname don't
need to have a global scope. Make them static.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <1259734990-9034-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/syscalls.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c2df3a593236..e79e2f3ccc51 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -134,7 +134,7 @@ struct perf_event_attr;
 #define SYSCALL_TRACE_ENTER_EVENT(sname)				\
 	static const struct syscall_metadata __syscall_meta_##sname;	\
 	static struct ftrace_event_call event_enter_##sname;		\
-	struct trace_event enter_syscall_print_##sname = {		\
+	static struct trace_event enter_syscall_print_##sname = {	\
 		.trace                  = print_syscall_enter,		\
 	};								\
 	static struct ftrace_event_call __used				\
@@ -156,7 +156,7 @@ struct perf_event_attr;
 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
 	static const struct syscall_metadata __syscall_meta_##sname;	\
 	static struct ftrace_event_call event_exit_##sname;		\
-	struct trace_event exit_syscall_print_##sname = {		\
+	static struct trace_event exit_syscall_print_##sname = {	\
 		.trace                  = print_syscall_exit,		\
 	};								\
 	static struct ftrace_event_call __used				\
-- 
cgit v1.2.3


From fa1452e808732ae10e8b1267fd75fc2d028d634b Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Mon, 30 Nov 2009 14:59:44 +0900
Subject: locking, task_struct: Reduce size on TRACE_IRQFLAGS and 64bit

Reorder task_struct field for TRACE_IRQFLAGS to remove padding
on 64-bit.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4B135F50.8070302@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60bf583..49be8f7c05f6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1421,17 +1421,17 @@ struct task_struct {
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	unsigned int irq_events;
-	int hardirqs_enabled;
 	unsigned long hardirq_enable_ip;
-	unsigned int hardirq_enable_event;
 	unsigned long hardirq_disable_ip;
+	unsigned int hardirq_enable_event;
 	unsigned int hardirq_disable_event;
-	int softirqs_enabled;
+	int hardirqs_enabled;
+	int hardirq_context;
 	unsigned long softirq_disable_ip;
-	unsigned int softirq_disable_event;
 	unsigned long softirq_enable_ip;
+	unsigned int softirq_disable_event;
 	unsigned int softirq_enable_event;
-	int hardirq_context;
+	int softirqs_enabled;
 	int softirq_context;
 #endif
 #ifdef CONFIG_LOCKDEP
-- 
cgit v1.2.3


From d99ca3b977fc5a93141304f571475c2af9e6c1c5 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 2 Dec 2009 17:26:47 +0900
Subject: sched, cputime: Cleanups related to task_times()

- Remove if({u,s}t)s because no one call it with NULL now.
- Use cputime_{add,sub}().
- Add ifndef-endif for prev_{u,s}time since they are used
  only when !VIRT_CPU_ACCOUNTING.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B1624C7.7040302@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  2 ++
 kernel/fork.c         |  2 ++
 kernel/sched.c        | 16 ++++++----------
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0395b0f4df3a..dff85e58264e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1331,7 +1331,9 @@ struct task_struct {
 
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	cputime_t prev_utime, prev_stime;
+#endif
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time; 		/* monotonic time */
 	struct timespec real_start_time;	/* boot based time */
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c49257c..ad7cb6d1193c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1066,8 +1066,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->gtime = cputime_zero;
 	p->utimescaled = cputime_zero;
 	p->stimescaled = cputime_zero;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	p->prev_utime = cputime_zero;
 	p->prev_stime = cputime_zero;
+#endif
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 4883fee99314..17e2c1db2bde 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5184,10 +5184,8 @@ void account_idle_ticks(unsigned long ticks)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	if (ut)
-		*ut = p->utime;
-	if (st)
-		*st = p->stime;
+	*ut = p->utime;
+	*st = p->stime;
 }
 #else
 
@@ -5197,7 +5195,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	cputime_t rtime, utime = p->utime, total = utime + p->stime;
+	cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
 
 	/*
 	 * Use CFS's precise accounting:
@@ -5217,12 +5215,10 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	 * Compare with previous values, to keep monotonicity:
 	 */
 	p->prev_utime = max(p->prev_utime, utime);
-	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+	p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
 
-	if (ut)
-		*ut = p->prev_utime;
-	if (st)
-		*st = p->prev_stime;
+	*ut = p->prev_utime;
+	*st = p->prev_stime;
 }
 #endif
 
-- 
cgit v1.2.3


From 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 2 Dec 2009 17:28:07 +0900
Subject: sched, cputime: Introduce thread_group_times()

This is a real fix for problem of utime/stime values decreasing
described in the thread:

   http://lkml.org/lkml/2009/11/3/522

Now cputime is accounted in the following way:

 - {u,s}time in task_struct are increased every time when the thread
   is interrupted by a tick (timer interrupt).

 - When a thread exits, its {u,s}time are added to signal->{u,s}time,
   after adjusted by task_times().

 - When all threads in a thread_group exits, accumulated {u,s}time
   (and also c{u,s}time) in signal struct are added to c{u,s}time
   in signal struct of the group's parent.

So {u,s}time in task struct are "raw" tick count, while
{u,s}time and c{u,s}time in signal struct are "adjusted" values.

And accounted values are used by:

 - task_times(), to get cputime of a thread:
   This function returns adjusted values that originates from raw
   {u,s}time and scaled by sum_exec_runtime that accounted by CFS.

 - thread_group_cputime(), to get cputime of a thread group:
   This function returns sum of all {u,s}time of living threads in
   the group, plus {u,s}time in the signal struct that is sum of
   adjusted cputimes of all exited threads belonged to the group.

The problem is the return value of thread_group_cputime(),
because it is mixed sum of "raw" value and "adjusted" value:

  group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)

This misbehavior can break {u,s}time monotonicity.
Assume that if there is a thread that have raw values greater
than adjusted values (e.g. interrupted by 1000Hz ticks 50 times
but only runs 45ms) and if it exits, cputime will decrease (e.g.
-5ms).

To fix this, we could do:

  group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)

But task_times() contains hard divisions, so applying it for
every thread should be avoided.

This patch fixes the above problem in the following way:

 - Modify thread's exit (= __exit_signal()) not to use task_times().
   It means {u,s}time in signal struct accumulates raw values instead
   of adjusted values.  As the result it makes thread_group_cputime()
   to return pure sum of "raw" values.

 - Introduce a new function thread_group_times(*task, *utime, *stime)
   that converts "raw" values of thread_group_cputime() to "adjusted"
   values, in same calculation procedure as task_times().

 - Modify group's exit (= wait_task_zombie()) to use this introduced
   thread_group_times().  It make c{u,s}time in signal struct to
   have adjusted values like before this patch.

 - Replace some thread_group_cputime() by thread_group_times().
   This replacements are only applied where conveys the "adjusted"
   cputime to users, and where already uses task_times() near by it.
   (i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)

This patch have a positive side effect:

 - Before this patch, if a group contains many short-life threads
   (e.g. runs 0.9ms and not interrupted by ticks), the group's
   cputime could be invisible since thread's cputime was accumulated
   after adjusted: imagine adjustment function as adj(ticks, runtime),
     {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
   After this patch it will not happen because the adjustment is
   applied after accumulated.

v2:
 - remove if()s, put new variables into signal_struct.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Spencer Candland <spencer@bluehost.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c       |  5 +----
 include/linux/sched.h |  4 ++++
 kernel/exit.c         | 23 ++++++++++++-----------
 kernel/fork.c         |  3 +++
 kernel/sched.c        | 41 +++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c          | 18 ++++++++----------
 6 files changed, 69 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ca61a88aed66..2571da43c736 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -506,7 +506,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
-			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
@@ -517,9 +516,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_cputime(task, &cputime);
-			utime = cputime.utime;
-			stime = cputime.stime;
+			thread_group_times(task, &utime, &stime);
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dff85e58264e..34238bd10ebf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -624,6 +624,9 @@ struct signal_struct {
 	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	cputime_t prev_utime, prev_stime;
+#endif
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
@@ -1723,6 +1726,7 @@ static inline void put_task_struct(struct task_struct *t)
 }
 
 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index 2eaf68b634e3..b221ad65fd20 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -91,8 +91,6 @@ static void __exit_signal(struct task_struct *tsk)
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
 	else {
-		cputime_t utime, stime;
-
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
@@ -112,9 +110,8 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		task_times(tsk, &utime, &stime);
-		sig->utime = cputime_add(sig->utime, utime);
-		sig->stime = cputime_add(sig->stime, stime);
+		sig->utime = cputime_add(sig->utime, tsk->utime);
+		sig->stime = cputime_add(sig->stime, tsk->stime);
 		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -1208,6 +1205,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 		unsigned long maxrss;
+		cputime_t tgutime, tgstime;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1223,20 +1221,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		 * need to protect the access to parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
+		 *
+		 * We use thread_group_times() to get times for the thread
+		 * group, which consolidates times for all threads in the
+		 * group including the group leader.
 		 */
+		thread_group_times(p, &tgutime, &tgstime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
 		psig->cutime =
 			cputime_add(psig->cutime,
-			cputime_add(p->utime,
-			cputime_add(sig->utime,
-				    sig->cutime)));
+			cputime_add(tgutime,
+				    sig->cutime));
 		psig->cstime =
 			cputime_add(psig->cstime,
-			cputime_add(p->stime,
-			cputime_add(sig->stime,
-				    sig->cstime)));
+			cputime_add(tgstime,
+				    sig->cstime));
 		psig->cgtime =
 			cputime_add(psig->cgtime,
 			cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index ad7cb6d1193c..3d6f121bbe8a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -884,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	sig->prev_utime = sig->prev_stime = cputime_zero;
+#endif
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 17e2c1db2bde..e6ba726941ae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5187,6 +5187,16 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->utime;
 	*st = p->stime;
 }
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+
+	*ut = cputime.utime;
+	*st = cputime.stime;
+}
 #else
 
 #ifndef nsecs_to_cputime
@@ -5220,6 +5230,37 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->prev_utime;
 	*st = p->prev_stime;
 }
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct signal_struct *sig = p->signal;
+	struct task_cputime cputime;
+	cputime_t rtime, utime, total;
+
+	thread_group_cputime(p, &cputime);
+
+	total = cputime_add(cputime.utime, cputime.stime);
+	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+	if (total) {
+		u64 temp;
+
+		temp = (u64)(rtime * cputime.utime);
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	sig->prev_utime = max(sig->prev_utime, utime);
+	sig->prev_stime = max(sig->prev_stime,
+			      cputime_sub(rtime, sig->prev_utime));
+
+	*ut = sig->prev_utime;
+	*st = sig->prev_stime;
+}
 #endif
 
 /*
diff --git a/kernel/sys.c b/kernel/sys.c
index bbdfce0d4347..9968c5fb55b9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -911,16 +911,15 @@ change_okay:
 
 void do_sys_times(struct tms *tms)
 {
-	struct task_cputime cputime;
-	cputime_t cutime, cstime;
+	cputime_t tgutime, tgstime, cutime, cstime;
 
-	thread_group_cputime(current, &cputime);
 	spin_lock_irq(&current->sighand->siglock);
+	thread_group_times(current, &tgutime, &tgstime);
 	cutime = current->signal->cutime;
 	cstime = current->signal->cstime;
 	spin_unlock_irq(&current->sighand->siglock);
-	tms->tms_utime = cputime_to_clock_t(cputime.utime);
-	tms->tms_stime = cputime_to_clock_t(cputime.stime);
+	tms->tms_utime = cputime_to_clock_t(tgutime);
+	tms->tms_stime = cputime_to_clock_t(tgstime);
 	tms->tms_cutime = cputime_to_clock_t(cutime);
 	tms->tms_cstime = cputime_to_clock_t(cstime);
 }
@@ -1338,8 +1337,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
-	cputime_t utime, stime;
-	struct task_cputime cputime;
+	cputime_t tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
 
 	memset((char *) r, 0, sizeof *r);
@@ -1372,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 				break;
 
 		case RUSAGE_SELF:
-			thread_group_cputime(p, &cputime);
-			utime = cputime_add(utime, cputime.utime);
-			stime = cputime_add(stime, cputime.stime);
+			thread_group_times(p, &tgutime, &tgstime);
+			utime = cputime_add(utime, tgutime);
+			stime = cputime_add(stime, tgstime);
 			r->ru_nvcsw += p->signal->nvcsw;
 			r->ru_nivcsw += p->signal->nivcsw;
 			r->ru_minflt += p->signal->min_flt;
-- 
cgit v1.2.3


From c81c2d95449cd218c2022ce6014c52fef1eb1f66 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 2 Dec 2009 16:49:02 +0000
Subject: skbuff: remove skb_dma_map/unmap

The two functions skb_dma_map/unmap are unsafe to use as they cause
problems when packets are cloned and sent to multiple devices while a HW
IOMMU is enabled.  Due to this it is best to remove the code so it is not
used by any other network driver maintainters.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  8 -------
 net/core/Makefile      |  1 -
 net/core/skb_dma_map.c | 65 --------------------------------------------------
 3 files changed, 74 deletions(-)
 delete mode 100644 net/core/skb_dma_map.c

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 89eed8cdd318..ae836fded530 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -416,14 +416,6 @@ struct sk_buff {
 
 #include <asm/system.h>
 
-#ifdef CONFIG_HAS_DMA
-#include <linux/dma-mapping.h>
-extern int skb_dma_map(struct device *dev, struct sk_buff *skb,
-		       enum dma_data_direction dir);
-extern void skb_dma_unmap(struct device *dev, struct sk_buff *skb,
-			  enum dma_data_direction dir);
-#endif
-
 static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
 {
 	return (struct dst_entry *)skb->_skb_dst;
diff --git a/net/core/Makefile b/net/core/Makefile
index 796f46eece5f..08791ac3e05a 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -6,7 +6,6 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
 	 gen_stats.o gen_estimator.o net_namespace.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
-obj-$(CONFIG_HAS_DMA) += skb_dma_map.o
 
 obj-y		     += dev.o ethtool.o dev_mcast.o dst.o netevent.o \
 			neighbour.o rtnetlink.o utils.o link_watch.o filter.o
diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c
deleted file mode 100644
index 79687dfd6957..000000000000
--- a/net/core/skb_dma_map.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/* skb_dma_map.c: DMA mapping helpers for socket buffers.
- *
- * Copyright (C) David S. Miller <davem@davemloft.net>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/dma-mapping.h>
-#include <linux/skbuff.h>
-
-int skb_dma_map(struct device *dev, struct sk_buff *skb,
-		enum dma_data_direction dir)
-{
-	struct skb_shared_info *sp = skb_shinfo(skb);
-	dma_addr_t map;
-	int i;
-
-	map = dma_map_single(dev, skb->data,
-			     skb_headlen(skb), dir);
-	if (dma_mapping_error(dev, map))
-		goto out_err;
-
-	sp->dma_head = map;
-	for (i = 0; i < sp->nr_frags; i++) {
-		skb_frag_t *fp = &sp->frags[i];
-
-		map = dma_map_page(dev, fp->page, fp->page_offset,
-				   fp->size, dir);
-		if (dma_mapping_error(dev, map))
-			goto unwind;
-		sp->dma_maps[i] = map;
-	}
-
-	return 0;
-
-unwind:
-	while (--i >= 0) {
-		skb_frag_t *fp = &sp->frags[i];
-
-		dma_unmap_page(dev, sp->dma_maps[i],
-			       fp->size, dir);
-	}
-	dma_unmap_single(dev, sp->dma_head,
-			 skb_headlen(skb), dir);
-out_err:
-	return -ENOMEM;
-}
-EXPORT_SYMBOL(skb_dma_map);
-
-void skb_dma_unmap(struct device *dev, struct sk_buff *skb,
-		   enum dma_data_direction dir)
-{
-	struct skb_shared_info *sp = skb_shinfo(skb);
-	int i;
-
-	dma_unmap_single(dev, sp->dma_head,
-			 skb_headlen(skb), dir);
-	for (i = 0; i < sp->nr_frags; i++) {
-		skb_frag_t *fp = &sp->frags[i];
-
-		dma_unmap_page(dev, sp->dma_maps[i],
-			       fp->size, dir);
-	}
-}
-EXPORT_SYMBOL(skb_dma_unmap);
-- 
cgit v1.2.3


From da5c78c82629a167794436e4306b4cf1faddea90 Mon Sep 17 00:00:00 2001
From: William Allen Simpson <william.allen.simpson@gmail.com>
Date: Wed, 2 Dec 2009 18:12:09 +0000
Subject: TCPCT part 1b: generate Responder Cookie secret

Define (missing) hash message size for SHA1.

Define hashing size constants specific to TCP cookies.

Add new function: tcp_cookie_generator().

Maintain global secret values for tcp_cookie_generator().

This is a significantly revised implementation of earlier (15-year-old)
Photuris [RFC-2522] code for the KA9Q cooperative multitasking platform.

Linux RCU technique appears to be well-suited to this application, though
neither of the circular queue items are freed.

These functions will also be used in subsequent patches that implement
additional features.

Signed-off-by: William.Allen.Simpson@gmail.com
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cryptohash.h |   1 +
 include/net/tcp.h          |   8 +++
 net/ipv4/tcp.c             | 140 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 149 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index c118b2ad9807..ec78a4bbe1d5 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -2,6 +2,7 @@
 #define __CRYPTOHASH_H
 
 #define SHA_DIGEST_WORDS 5
+#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
 #define SHA_WORKSPACE_WORDS 80
 
 void sha_init(__u32 *buf);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ec183fda05d0..4a99a8e39121 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1478,6 +1478,14 @@ struct tcp_request_sock_ops {
 #endif
 };
 
+/* Using SHA1 for now, define some constants.
+ */
+#define COOKIE_DIGEST_WORDS (SHA_DIGEST_WORDS)
+#define COOKIE_MESSAGE_WORDS (SHA_MESSAGE_BYTES / 4)
+#define COOKIE_WORKSPACE_WORDS (COOKIE_DIGEST_WORDS + COOKIE_MESSAGE_WORDS)
+
+extern int tcp_cookie_generator(u32 *bakery);
+
 extern void tcp_v4_init(void);
 extern void tcp_init(void);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7d4648f8b3d3..ba03ac80435a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,7 @@
 #include <linux/cache.h>
 #include <linux/err.h>
 #include <linux/crypto.h>
+#include <linux/time.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -2848,6 +2849,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
 
 #endif
 
+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover.  Each secret value has 4 states:
+ *
+ * Generating.  (tcp_secret_generating != tcp_secret_primary)
+ *    Generates new Responder-Cookies, but not yet used for primary
+ *    verification.  This is a short-term state, typically lasting only
+ *    one round trip time (RTT).
+ *
+ * Primary.  (tcp_secret_generating == tcp_secret_primary)
+ *    Used both for generation and primary verification.
+ *
+ * Retiring.  (tcp_secret_retiring != tcp_secret_secondary)
+ *    Used for verification, until the first failure that can be
+ *    verified by the newer Generating secret.  At that time, this
+ *    cookie's state is changed to Secondary, and the Generating
+ *    cookie's state is changed to Primary.  This is a short-term state,
+ *    typically lasting only one round trip time (RTT).
+ *
+ * Secondary.  (tcp_secret_retiring == tcp_secret_secondary)
+ *    Used for secondary verification, after primary verification
+ *    failures.  This state lasts no more than twice the Maximum Segment
+ *    Lifetime (2MSL).  Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+	/* The secret is divided into two parts.  The digest part is the
+	 * equivalent of previously hashing a secret and saving the state,
+	 * and serves as an initialization vector (IV).  The message part
+	 * serves as the trailing secret.
+	 */
+	u32				secrets[COOKIE_WORKSPACE_WORDS];
+	unsigned long			expires;
+};
+
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+/* Essentially a circular list, without dynamic allocation. */
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+static DEFINE_SPINLOCK(tcp_secret_locker);
+
+/* Select a pseudo-random word in the cookie workspace.
+ */
+static inline u32 tcp_cookie_work(const u32 *ws, const int n)
+{
+	return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
+}
+
+/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
+ * Called in softirq context.
+ * Returns: 0 for success.
+ */
+int tcp_cookie_generator(u32 *bakery)
+{
+	unsigned long jiffy = jiffies;
+
+	if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
+		spin_lock_bh(&tcp_secret_locker);
+		if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
+			/* refreshed by another */
+			memcpy(bakery,
+			       &tcp_secret_generating->secrets[0],
+			       COOKIE_WORKSPACE_WORDS);
+		} else {
+			/* still needs refreshing */
+			get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
+
+			/* The first time, paranoia assumes that the
+			 * randomization function isn't as strong.  But,
+			 * this secret initialization is delayed until
+			 * the last possible moment (packet arrival).
+			 * Although that time is observable, it is
+			 * unpredictably variable.  Mash in the most
+			 * volatile clock bits available, and expire the
+			 * secret extra quickly.
+			 */
+			if (unlikely(tcp_secret_primary->expires ==
+				     tcp_secret_secondary->expires)) {
+				struct timespec tv;
+
+				getnstimeofday(&tv);
+				bakery[COOKIE_DIGEST_WORDS+0] ^=
+					(u32)tv.tv_nsec;
+
+				tcp_secret_secondary->expires = jiffy
+					+ TCP_SECRET_1MSL
+					+ (0x0f & tcp_cookie_work(bakery, 0));
+			} else {
+				tcp_secret_secondary->expires = jiffy
+					+ TCP_SECRET_LIFE
+					+ (0xff & tcp_cookie_work(bakery, 1));
+				tcp_secret_primary->expires = jiffy
+					+ TCP_SECRET_2MSL
+					+ (0x1f & tcp_cookie_work(bakery, 2));
+			}
+			memcpy(&tcp_secret_secondary->secrets[0],
+			       bakery, COOKIE_WORKSPACE_WORDS);
+
+			rcu_assign_pointer(tcp_secret_generating,
+					   tcp_secret_secondary);
+			rcu_assign_pointer(tcp_secret_retiring,
+					   tcp_secret_primary);
+			/*
+			 * Neither call_rcu() nor synchronize_rcu() needed.
+			 * Retiring data is not freed.  It is replaced after
+			 * further (locked) pointer updates, and a quiet time
+			 * (minimum 1MSL, maximum LIFE - 2MSL).
+			 */
+		}
+		spin_unlock_bh(&tcp_secret_locker);
+	} else {
+		rcu_read_lock_bh();
+		memcpy(bakery,
+		       &rcu_dereference(tcp_secret_generating)->secrets[0],
+		       COOKIE_WORKSPACE_WORDS);
+		rcu_read_unlock_bh();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
 void tcp_done(struct sock *sk)
 {
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
@@ -2882,6 +3012,7 @@ void __init tcp_init(void)
 	struct sk_buff *skb = NULL;
 	unsigned long nr_pages, limit;
 	int order, i, max_share;
+	unsigned long jiffy = jiffies;
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
@@ -2975,6 +3106,15 @@ void __init tcp_init(void)
 	       tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
 
 	tcp_register_congestion_control(&tcp_reno);
+
+	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+	memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
+	tcp_secret_one.expires = jiffy; /* past due */
+	tcp_secret_two.expires = jiffy; /* past due */
+	tcp_secret_generating = &tcp_secret_one;
+	tcp_secret_primary = &tcp_secret_one;
+	tcp_secret_retiring = &tcp_secret_two;
+	tcp_secret_secondary = &tcp_secret_two;
 }
 
 EXPORT_SYMBOL(tcp_close);
-- 
cgit v1.2.3


From 519855c508b9a17878c0977a3cdefc09b59b30df Mon Sep 17 00:00:00 2001
From: William Allen Simpson <william.allen.simpson@gmail.com>
Date: Wed, 2 Dec 2009 18:14:19 +0000
Subject: TCPCT part 1c: sysctl_tcp_cookie_size, socket option
 TCP_COOKIE_TRANSACTIONS

Define sysctl (tcp_cookie_size) to turn on and off the cookie option
default globally, instead of a compiled configuration option.

Define per socket option (TCP_COOKIE_TRANSACTIONS) for setting constant
data values, retrieving variable cookie values, and other facilities.

Move inline tcp_clear_options() unchanged from net/tcp.h to linux/tcp.h,
near its corresponding struct tcp_options_received (prior to changes).

This is a straightforward re-implementation of an earlier (year-old)
patch that no longer applies cleanly, with permission of the original
author (Adam Langley):

    http://thread.gmane.org/gmane.linux.network/102586

These functions will also be used in subsequent patches that implement
additional features.

Requires:
   net: TCP_MSS_DEFAULT, TCP_MSS_DESIRED

Signed-off-by: William.Allen.Simpson@gmail.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  8 ++++++++
 include/linux/tcp.h                    | 33 ++++++++++++++++++++++++++++++++-
 include/net/tcp.h                      |  6 +-----
 net/ipv4/sysctl_net_ipv4.c             |  8 ++++++++
 net/ipv4/tcp_output.c                  |  3 +++
 5 files changed, 52 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 554440af675c..989f5538b8dd 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -164,6 +164,14 @@ tcp_congestion_control - STRING
 	additional choices may be available based on kernel configuration.
 	Default is set as part of kernel configuration.
 
+tcp_cookie_size - INTEGER
+	Default size of TCP Cookie Transactions (TCPCT) option, that may be
+	overridden on a per socket basis by the TCPCT socket option.
+	Values greater than the maximum (16) are interpreted as the maximum.
+	Values greater than zero and less than the minimum (8) are interpreted
+	as the minimum.  Odd values are interpreted as the next even value.
+	Default: 0 (off).
+
 tcp_dsack - BOOLEAN
 	Allows TCP to send "duplicate" SACKs.
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 32d7d77b4a01..eaa3113b3786 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -102,7 +102,9 @@ enum {
 #define TCP_QUICKACK		12	/* Block/reenable quick acks */
 #define TCP_CONGESTION		13	/* Congestion control algorithm */
 #define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */
+#define TCP_COOKIE_TRANSACTIONS	15	/* TCP Cookie Transactions */
 
+/* for TCP_INFO socket option */
 #define TCPI_OPT_TIMESTAMPS	1
 #define TCPI_OPT_SACK		2
 #define TCPI_OPT_WSCALE		4
@@ -174,6 +176,30 @@ struct tcp_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];		/* key (binary) */
 };
 
+/* for TCP_COOKIE_TRANSACTIONS (TCPCT) socket option */
+#define TCP_COOKIE_MIN		 8		/*  64-bits */
+#define TCP_COOKIE_MAX		16		/* 128-bits */
+#define TCP_COOKIE_PAIR_SIZE	(2*TCP_COOKIE_MAX)
+
+/* Flags for both getsockopt and setsockopt */
+#define TCP_COOKIE_IN_ALWAYS	(1 << 0)	/* Discard SYN without cookie */
+#define TCP_COOKIE_OUT_NEVER	(1 << 1)	/* Prohibit outgoing cookies,
+						 * supercedes everything. */
+
+/* Flags for getsockopt */
+#define TCP_S_DATA_IN		(1 << 2)	/* Was data received? */
+#define TCP_S_DATA_OUT		(1 << 3)	/* Was data sent? */
+
+/* TCP_COOKIE_TRANSACTIONS data */
+struct tcp_cookie_transactions {
+	__u16	tcpct_flags;			/* see above */
+	__u8	__tcpct_pad1;			/* zero */
+	__u8	tcpct_cookie_desired;		/* bytes */
+	__u16	tcpct_s_data_desired;		/* bytes of variable data */
+	__u16	tcpct_used;			/* bytes in value */
+	__u8	tcpct_value[TCP_MSS_DEFAULT];
+};
+
 #ifdef __KERNEL__
 
 #include <linux/skbuff.h>
@@ -227,6 +253,11 @@ struct tcp_options_received {
 	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
 };
 
+static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
+{
+	rx_opt->tstamp_ok = rx_opt->sack_ok = rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
+}
+
 /* This is the max number of SACKS that we'll generate and process. It's safe
  * to increse this, although since:
  *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
@@ -435,6 +466,6 @@ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
 	return (struct tcp_timewait_sock *)sk;
 }
 
-#endif
+#endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_TCP_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4a99a8e39121..738b65f01e26 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -234,6 +234,7 @@ extern int sysctl_tcp_base_mss;
 extern int sysctl_tcp_workaround_signed_windows;
 extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_max_ssthresh;
+extern int sysctl_tcp_cookie_size;
 
 extern atomic_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -340,11 +341,6 @@ static inline void tcp_dec_quickack_mode(struct sock *sk,
 
 extern void tcp_enter_quickack_mode(struct sock *sk);
 
-static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
-{
- 	rx_opt->tstamp_ok = rx_opt->sack_ok = rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
-}
-
 #define	TCP_ECN_OK		1
 #define	TCP_ECN_QUEUE_CWR	2
 #define	TCP_ECN_DEMAND_CWR	4
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index c00323bae044..13f7ab6ad6a0 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -712,6 +712,14 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "tcp_cookie_size",
+		.data		= &sysctl_tcp_cookie_size,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "udp_mem",
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b8b25049f257..307f318fe931 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -59,6 +59,9 @@ int sysctl_tcp_base_mss __read_mostly = 512;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
+int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
+
+
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 435cf559f02ea3a3159eb316f97dc88bdebe9432 Mon Sep 17 00:00:00 2001
From: William Allen Simpson <william.allen.simpson@gmail.com>
Date: Wed, 2 Dec 2009 18:17:05 +0000
Subject: TCPCT part 1d: define TCP cookie option, extend existing struct's

Data structures are carefully composed to require minimal additions.
For example, the struct tcp_options_received cookie_plus variable fits
between existing 16-bit and 8-bit variables, requiring no additional
space (taking alignment into consideration).  There are no additions to
tcp_request_sock, and only 1 pointer in tcp_sock.

This is a significantly revised implementation of an earlier (year-old)
patch that no longer applies cleanly, with permission of the original
author (Adam Langley):

    http://thread.gmane.org/gmane.linux.network/102586

The principle difference is using a TCP option to carry the cookie nonce,
instead of a user configured offset in the data.  This is more flexible and
less subject to user configuration error.  Such a cookie option has been
suggested for many years, and is also useful without SYN data, allowing
several related concepts to use the same extension option.

    "Re: SYN floods (was: does history repeat itself?)", September 9, 1996.
    http://www.merit.net/mail.archives/nanog/1996-09/msg00235.html

    "Re: what a new TCP header might look like", May 12, 1998.
    ftp://ftp.isi.edu/end2end/end2end-interest-1998.mail

These functions will also be used in subsequent patches that implement
additional features.

Requires:
   TCPCT part 1a: add request_values parameter for sending SYNACK
   TCPCT part 1b: generate Responder Cookie secret
   TCPCT part 1c: sysctl_tcp_cookie_size, socket option TCP_COOKIE_TRANSACTIONS

Signed-off-by: William.Allen.Simpson@gmail.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 29 +++++++++++++----
 include/net/tcp.h        | 83 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      | 20 ++++++++++++
 net/ipv4/tcp_minisocks.c | 46 ++++++++++++++++++++++-----
 net/ipv6/tcp_ipv6.c      | 13 ++++++++
 5 files changed, 177 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index eaa3113b3786..7fee8a4df931 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -247,31 +247,38 @@ struct tcp_options_received {
 		sack_ok : 4,	/* SACK seen on SYN packet		*/
 		snd_wscale : 4,	/* Window scaling received from sender	*/
 		rcv_wscale : 4;	/* Window scaling to send to receiver	*/
-/*	SACKs data	*/
+	u8	cookie_plus:6,	/* bytes in authenticator/cookie option	*/
+		cookie_out_never:1,
+		cookie_in_always:1;
 	u8	num_sacks;	/* Number of SACK blocks		*/
-	u16	user_mss;  	/* mss requested by user in ioctl */
+	u16	user_mss;	/* mss requested by user in ioctl	*/
 	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
 };
 
 static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 {
-	rx_opt->tstamp_ok = rx_opt->sack_ok = rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
+	rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
+	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
+	rx_opt->cookie_plus = 0;
 }
 
 /* This is the max number of SACKS that we'll generate and process. It's safe
- * to increse this, although since:
+ * to increase this, although since:
  *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
  * only four options will fit in a standard TCP header */
 #define TCP_NUM_SACKS 4
 
+struct tcp_cookie_values;
+struct tcp_request_sock_ops;
+
 struct tcp_request_sock {
 	struct inet_request_sock 	req;
 #ifdef CONFIG_TCP_MD5SIG
 	/* Only used by TCP MD5 Signature so far. */
 	const struct tcp_request_sock_ops *af_specific;
 #endif
-	u32			 	rcv_isn;
-	u32			 	snt_isn;
+	u32				rcv_isn;
+	u32				snt_isn;
 };
 
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
@@ -441,6 +448,12 @@ struct tcp_sock {
 /* TCP MD5 Signature Option information */
 	struct tcp_md5sig_info	*md5sig_info;
 #endif
+
+	/* When the cookie options are generated and exchanged, then this
+	 * object holds a reference to them (cookie_values->kref).  Also
+	 * contains related tcp_cookie_transactions fields.
+	 */
+	struct tcp_cookie_values  *cookie_values;
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
@@ -459,6 +472,10 @@ struct tcp_timewait_sock {
 	u16			  tw_md5_keylen;
 	u8			  tw_md5_key[TCP_MD5SIG_MAXKEYLEN];
 #endif
+	/* Few sockets in timewait have cookies; in that case, then this
+	 * object holds a reference to them (tw_cookie_values->kref).
+	 */
+	struct tcp_cookie_values  *tw_cookie_values;
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 738b65f01e26..f9abd9becabd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -30,6 +30,7 @@
 #include <linux/dmaengine.h>
 #include <linux/crypto.h>
 #include <linux/cryptohash.h>
+#include <linux/kref.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -164,6 +165,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOPT_SACK             5       /* SACK Block */
 #define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
 #define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
+#define TCPOPT_COOKIE		253	/* Cookie extension (experimental) */
 
 /*
  *     TCP option lengths
@@ -174,6 +176,10 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_SACK_PERM      2
 #define TCPOLEN_TIMESTAMP      10
 #define TCPOLEN_MD5SIG         18
+#define TCPOLEN_COOKIE_BASE    2	/* Cookie-less header extension */
+#define TCPOLEN_COOKIE_PAIR    3	/* Cookie pair header extension */
+#define TCPOLEN_COOKIE_MIN     (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MIN)
+#define TCPOLEN_COOKIE_MAX     (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MAX)
 
 /* But this is what stacks really send out. */
 #define TCPOLEN_TSTAMP_ALIGNED		12
@@ -1482,6 +1488,83 @@ struct tcp_request_sock_ops {
 
 extern int tcp_cookie_generator(u32 *bakery);
 
+/**
+ *	struct tcp_cookie_values - each socket needs extra space for the
+ *	cookies, together with (optional) space for any SYN data.
+ *
+ *	A tcp_sock contains a pointer to the current value, and this is
+ *	cloned to the tcp_timewait_sock.
+ *
+ * @cookie_pair:	variable data from the option exchange.
+ *
+ * @cookie_desired:	user specified tcpct_cookie_desired.  Zero
+ *			indicates default (sysctl_tcp_cookie_size).
+ *			After cookie sent, remembers size of cookie.
+ *			Range 0, TCP_COOKIE_MIN to TCP_COOKIE_MAX.
+ *
+ * @s_data_desired:	user specified tcpct_s_data_desired.  When the
+ *			constant payload is specified (@s_data_constant),
+ *			holds its length instead.
+ *			Range 0 to TCP_MSS_DESIRED.
+ *
+ * @s_data_payload:	constant data that is to be included in the
+ *			payload of SYN or SYNACK segments when the
+ *			cookie option is present.
+ */
+struct tcp_cookie_values {
+	struct kref	kref;
+	u8		cookie_pair[TCP_COOKIE_PAIR_SIZE];
+	u8		cookie_pair_size;
+	u8		cookie_desired;
+	u16		s_data_desired:11,
+			s_data_constant:1,
+			s_data_in:1,
+			s_data_out:1,
+			s_data_unused:2;
+	u8		s_data_payload[0];
+};
+
+static inline void tcp_cookie_values_release(struct kref *kref)
+{
+	kfree(container_of(kref, struct tcp_cookie_values, kref));
+}
+
+/* The length of constant payload data.  Note that s_data_desired is
+ * overloaded, depending on s_data_constant: either the length of constant
+ * data (returned here) or the limit on variable data.
+ */
+static inline int tcp_s_data_size(const struct tcp_sock *tp)
+{
+	return (tp->cookie_values != NULL && tp->cookie_values->s_data_constant)
+		? tp->cookie_values->s_data_desired
+		: 0;
+}
+
+/**
+ *	struct tcp_extend_values - tcp_ipv?.c to tcp_output.c workspace.
+ *
+ *	As tcp_request_sock has already been extended in other places, the
+ *	only remaining method is to pass stack values along as function
+ *	parameters.  These parameters are not needed after sending SYNACK.
+ *
+ * @cookie_bakery:	cryptographic secret and message workspace.
+ *
+ * @cookie_plus:	bytes in authenticator/cookie option, copied from
+ *			struct tcp_options_received (above).
+ */
+struct tcp_extend_values {
+	struct request_values		rv;
+	u32				cookie_bakery[COOKIE_WORKSPACE_WORDS];
+	u8				cookie_plus:6,
+					cookie_out_never:1,
+					cookie_in_always:1;
+};
+
+static inline struct tcp_extend_values *tcp_xv(struct request_values *rvp)
+{
+	return (struct tcp_extend_values *)rvp;
+}
+
 extern void tcp_v4_init(void);
 extern void tcp_init(void);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 649a36d99c73..a2bcac9b388e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1833,6 +1833,19 @@ static int tcp_v4_init_sock(struct sock *sk)
 	tp->af_specific = &tcp_sock_ipv4_specific;
 #endif
 
+	/* TCP Cookie Transactions */
+	if (sysctl_tcp_cookie_size > 0) {
+		/* Default, cookies without s_data_payload. */
+		tp->cookie_values =
+			kzalloc(sizeof(*tp->cookie_values),
+				sk->sk_allocation);
+		if (tp->cookie_values != NULL)
+			kref_init(&tp->cookie_values->kref);
+	}
+	/* Presumed zeroed, in order of appearance:
+	 *	cookie_in_always, cookie_out_never,
+	 *	s_data_constant, s_data_in, s_data_out
+	 */
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
@@ -1886,6 +1899,13 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		sk->sk_sndmsg_page = NULL;
 	}
 
+	/* TCP Cookie Transactions */
+	if (tp->cookie_values != NULL) {
+		kref_put(&tp->cookie_values->kref,
+			 tcp_cookie_values_release);
+		tp->cookie_values = NULL;
+	}
+
 	percpu_counter_dec(&tcp_sockets_allocated);
 }
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d3f6bbfc76f0..96852af43ca7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -383,14 +383,43 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		const struct inet_request_sock *ireq = inet_rsk(req);
 		struct tcp_request_sock *treq = tcp_rsk(req);
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
-		struct tcp_sock *newtp;
+		struct tcp_sock *newtp = tcp_sk(newsk);
+		struct tcp_sock *oldtp = tcp_sk(sk);
+		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
+
+		/* TCP Cookie Transactions require space for the cookie pair,
+		 * as it differs for each connection.  There is no need to
+		 * copy any s_data_payload stored at the original socket.
+		 * Failure will prevent resuming the connection.
+		 *
+		 * Presumed copied, in order of appearance:
+		 *	cookie_in_always, cookie_out_never
+		 */
+		if (oldcvp != NULL) {
+			struct tcp_cookie_values *newcvp =
+				kzalloc(sizeof(*newtp->cookie_values),
+					GFP_ATOMIC);
+
+			if (newcvp != NULL) {
+				kref_init(&newcvp->kref);
+				newcvp->cookie_desired =
+						oldcvp->cookie_desired;
+				newtp->cookie_values = newcvp;
+			} else {
+				/* Not Yet Implemented */
+				newtp->cookie_values = NULL;
+			}
+		}
 
 		/* Now setup tcp_sock */
-		newtp = tcp_sk(newsk);
 		newtp->pred_flags = 0;
-		newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
-		newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
-		newtp->snd_up = treq->snt_isn + 1;
+
+		newtp->rcv_wup = newtp->copied_seq =
+		newtp->rcv_nxt = treq->rcv_isn + 1;
+
+		newtp->snd_sml = newtp->snd_una =
+		newtp->snd_nxt = newtp->snd_up =
+			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
 
 		tcp_prequeue_init(newtp);
 
@@ -423,8 +452,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		skb_queue_head_init(&newtp->out_of_order_queue);
-		newtp->write_seq = treq->snt_isn + 1;
-		newtp->pushed_seq = newtp->write_seq;
+		newtp->write_seq = newtp->pushed_seq =
+			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
 
 		newtp->rx_opt.saw_tstamp = 0;
 
@@ -590,7 +619,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	 * Invalid ACK: reset will be sent by listening socket
 	 */
 	if ((flg & TCP_FLAG_ACK) &&
-	    (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
+	    (TCP_SKB_CB(skb)->ack_seq !=
+	     tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
 		return sk;
 
 	/* Also, it would be not so bad idea to check rcv_tsecr, which
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index da6e24416d75..f2ec38289a4a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1864,6 +1864,19 @@ static int tcp_v6_init_sock(struct sock *sk)
 	tp->af_specific = &tcp_sock_ipv6_specific;
 #endif
 
+	/* TCP Cookie Transactions */
+	if (sysctl_tcp_cookie_size > 0) {
+		/* Default, cookies without s_data_payload. */
+		tp->cookie_values =
+			kzalloc(sizeof(*tp->cookie_values),
+				sk->sk_allocation);
+		if (tp->cookie_values != NULL)
+			kref_init(&tp->cookie_values->kref);
+	}
+	/* Presumed zeroed, in order of appearance:
+	 *	cookie_in_always, cookie_out_never,
+	 *	s_data_constant, s_data_in, s_data_out
+	 */
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
-- 
cgit v1.2.3


From 7cff7ce94a7df2ccf5ac76b48ee0995fee2060df Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 9 Oct 2009 00:01:39 -0700
Subject: include/linux/compiler-gcc4.h: Fix build bug - gcc-4.0.2 doesn't
 understand __builtin_object_size

Maybe 4.1.0 doesn't too, but this fixed it for me.

Caused by:

 4a31276: x86: Turn the copy_from_user check into an (optional) compile time warning
 9f0cf4a: x86: Use __builtin_object_size() to validate the buffer size for copy_from_user()

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <200910090724.n997OQl6013538@imap1.linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/compiler-gcc4.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index 77542c57e20a..e6ef279ca20c 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -38,7 +38,9 @@
 
 #endif
 
+#if __GNUC_MINOR__ > 0
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
+#endif
 #if __GNUC_MINOR__ >= 4
 #define __compiletime_warning(message) __attribute__((warning(message)))
 #define __compiletime_error(message) __attribute__((error(message)))
-- 
cgit v1.2.3


From 1a6e4a8c276e122dbeb6f9c610f29735e4236bfd Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 24 Aug 2009 11:54:19 +0300
Subject: KVM: Move irq sharing information to irqchip level

This removes assumptions that max GSIs is smaller than number of pins.
Sharing is tracked on pin level not GSI level.

[avi: no PIC on ia64]

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/irq.h              |  1 +
 include/linux/kvm_host.h        |  2 +-
 virt/kvm/ioapic.h               |  1 +
 virt/kvm/irq_comm.c             | 59 +++++++++++++++++++++++++----------------
 5 files changed, 39 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0b113f2b58cf..35d3236c9de4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -410,7 +410,6 @@ struct kvm_arch{
 	gpa_t ept_identity_map_addr;
 
 	unsigned long irq_sources_bitmap;
-	unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
 	u64 vm_init_tsc;
 };
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..c025a2362aae 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -71,6 +71,7 @@ struct kvm_pic {
 	int output;		/* intr from master PIC */
 	struct kvm_io_device dev;
 	void (*ack_notifier)(void *opaque, int irq);
+	unsigned long irq_states[16];
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b7bbb5ddd7ae..1c7f8c49e4e8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -120,7 +120,7 @@ struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
 	int (*set)(struct kvm_kernel_irq_routing_entry *e,
-		    struct kvm *kvm, int level);
+		   struct kvm *kvm, int irq_source_id, int level);
 	union {
 		struct {
 			unsigned irqchip;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 7080b713c160..6e461ade6365 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -41,6 +41,7 @@ struct kvm_ioapic {
 	u32 irr;
 	u32 pad;
 	union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+	unsigned long irq_states[IOAPIC_NUM_PINS];
 	struct kvm_io_device dev;
 	struct kvm *kvm;
 	void (*ack_notifier)(void *opaque, int irq);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 001663ff401a..9783f5c43dae 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -31,20 +31,39 @@
 
 #include "ioapic.h"
 
+static inline int kvm_irq_line_state(unsigned long *irq_state,
+				     int irq_source_id, int level)
+{
+	/* Logical OR for level trig interrupt */
+	if (level)
+		set_bit(irq_source_id, irq_state);
+	else
+		clear_bit(irq_source_id, irq_state);
+
+	return !!(*irq_state);
+}
+
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
-			   struct kvm *kvm, int level)
+			   struct kvm *kvm, int irq_source_id, int level)
 {
 #ifdef CONFIG_X86
-	return kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level);
+	struct kvm_pic *pic = pic_irqchip(kvm);
+	level = kvm_irq_line_state(&pic->irq_states[e->irqchip.pin],
+				   irq_source_id, level);
+	return kvm_pic_set_irq(pic, e->irqchip.pin, level);
 #else
 	return -1;
 #endif
 }
 
 static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
-			      struct kvm *kvm, int level)
+			      struct kvm *kvm, int irq_source_id, int level)
 {
-	return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	level = kvm_irq_line_state(&ioapic->irq_states[e->irqchip.pin],
+				   irq_source_id, level);
+
+	return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, level);
 }
 
 inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
@@ -96,10 +115,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 }
 
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-		       struct kvm *kvm, int level)
+		       struct kvm *kvm, int irq_source_id, int level)
 {
 	struct kvm_lapic_irq irq;
 
+	if (!level)
+		return -1;
+
 	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
 
 	irq.dest_id = (e->msi.address_lo &
@@ -125,34 +147,19 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
 {
 	struct kvm_kernel_irq_routing_entry *e;
-	unsigned long *irq_state, sig_level;
 	int ret = -1;
 
 	trace_kvm_set_irq(irq, level, irq_source_id);
 
 	WARN_ON(!mutex_is_locked(&kvm->irq_lock));
 
-	if (irq < KVM_IOAPIC_NUM_PINS) {
-		irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
-
-		/* Logical OR for level trig interrupt */
-		if (level)
-			set_bit(irq_source_id, irq_state);
-		else
-			clear_bit(irq_source_id, irq_state);
-		sig_level = !!(*irq_state);
-	} else if (!level)
-		return ret;
-	else /* Deal with MSI/MSI-X */
-		sig_level = 1;
-
 	/* Not possible to detect if the guest uses the PIC or the
 	 * IOAPIC.  So set the bit in both. The guest will ignore
 	 * writes to the unused one.
 	 */
 	list_for_each_entry(e, &kvm->irq_routing, link)
 		if (e->gsi == irq) {
-			int r = e->set(e, kvm, sig_level);
+			int r = e->set(e, kvm, irq_source_id, level);
 			if (r < 0)
 				continue;
 
@@ -232,8 +239,14 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 		printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
 		return;
 	}
-	for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
-		clear_bit(irq_source_id, &kvm->arch.irq_states[i]);
+	for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) {
+		clear_bit(irq_source_id, &kvm->arch.vioapic->irq_states[i]);
+		if (i >= 16)
+			continue;
+#ifdef CONFIG_X86
+		clear_bit(irq_source_id, &pic_irqchip(kvm)->irq_states[i]);
+#endif
+	}
 	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
 	mutex_unlock(&kvm->irq_lock);
 }
-- 
cgit v1.2.3


From 46e624b95c36d729bdf24010fff11d16f6fe94fa Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 24 Aug 2009 11:54:20 +0300
Subject: KVM: Change irq routing table to use gsi indexed array

Use gsi indexed array instead of scanning all entries on each interrupt
injection.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 21 ++++++++++--
 virt/kvm/irq_comm.c      | 88 +++++++++++++++++++++++++++++-------------------
 virt/kvm/kvm_main.c      |  1 -
 3 files changed, 71 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c7f8c49e4e8..f403e66557fb 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -128,7 +128,17 @@ struct kvm_kernel_irq_routing_entry {
 		} irqchip;
 		struct msi_msg msi;
 	};
-	struct list_head link;
+	struct hlist_node link;
+};
+
+struct kvm_irq_routing_table {
+	struct kvm_kernel_irq_routing_entry *rt_entries;
+	u32 nr_rt_entries;
+	/*
+	 * Array indexed by gsi. Each entry contains list of irq chips
+	 * the gsi is connected to.
+	 */
+	struct hlist_head map[0];
 };
 
 struct kvm {
@@ -166,7 +176,7 @@ struct kvm {
 
 	struct mutex irq_lock;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
-	struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */
+	struct kvm_irq_routing_table *irq_routing;
 	struct hlist_head mask_notifier_list;
 #endif
 
@@ -390,7 +400,12 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
-int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
+#ifdef __KVM_HAVE_IOAPIC
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+				   union kvm_ioapic_redirect_entry *entry,
+				   unsigned long *deliver_bitmask);
+#endif
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 9783f5c43dae..81950f6f6fd9 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -144,10 +144,12 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
  *  = 0   Interrupt was coalesced (previous irq is still pending)
  *  > 0   Number of CPUs interrupt was delivered to
  */
-int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	int ret = -1;
+	struct kvm_irq_routing_table *irq_rt;
+	struct hlist_node *n;
 
 	trace_kvm_set_irq(irq, level, irq_source_id);
 
@@ -157,8 +159,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
 	 * IOAPIC.  So set the bit in both. The guest will ignore
 	 * writes to the unused one.
 	 */
-	list_for_each_entry(e, &kvm->irq_routing, link)
-		if (e->gsi == irq) {
+	irq_rt = kvm->irq_routing;
+	if (irq < irq_rt->nr_rt_entries)
+		hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
 			int r = e->set(e, kvm, irq_source_id, level);
 			if (r < 0)
 				continue;
@@ -170,20 +173,23 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
 
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
-	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_ack_notifier *kian;
 	struct hlist_node *n;
 	unsigned gsi = pin;
+	int i;
 
 	trace_kvm_ack_irq(irqchip, pin);
 
-	list_for_each_entry(e, &kvm->irq_routing, link)
+	for (i = 0; i < kvm->irq_routing->nr_rt_entries; i++) {
+		struct kvm_kernel_irq_routing_entry *e;
+		e = &kvm->irq_routing->rt_entries[i];
 		if (e->type == KVM_IRQ_ROUTING_IRQCHIP &&
 		    e->irqchip.irqchip == irqchip &&
 		    e->irqchip.pin == pin) {
 			gsi = e->gsi;
 			break;
 		}
+	}
 
 	hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
 		if (kian->gsi == gsi)
@@ -280,26 +286,30 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
 			kimn->func(kimn, mask);
 }
 
-static void __kvm_free_irq_routing(struct list_head *irq_routing)
-{
-	struct kvm_kernel_irq_routing_entry *e, *n;
-
-	list_for_each_entry_safe(e, n, irq_routing, link)
-		kfree(e);
-}
-
 void kvm_free_irq_routing(struct kvm *kvm)
 {
 	mutex_lock(&kvm->irq_lock);
-	__kvm_free_irq_routing(&kvm->irq_routing);
+	kfree(kvm->irq_routing);
 	mutex_unlock(&kvm->irq_lock);
 }
 
-static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+			       struct kvm_kernel_irq_routing_entry *e,
 			       const struct kvm_irq_routing_entry *ue)
 {
 	int r = -EINVAL;
 	int delta;
+	struct kvm_kernel_irq_routing_entry *ei;
+	struct hlist_node *n;
+
+	/*
+	 * Do not allow GSI to be mapped to the same irqchip more than once.
+	 * Allow only one to one mapping between GSI and MSI.
+	 */
+	hlist_for_each_entry(ei, n, &rt->map[ue->gsi], link)
+		if (ei->type == KVM_IRQ_ROUTING_MSI ||
+		    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
+			return r;
 
 	e->gsi = ue->gsi;
 	e->type = ue->type;
@@ -332,6 +342,8 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
 	default:
 		goto out;
 	}
+
+	hlist_add_head(&e->link, &rt->map[e->gsi]);
 	r = 0;
 out:
 	return r;
@@ -343,43 +355,49 @@ int kvm_set_irq_routing(struct kvm *kvm,
 			unsigned nr,
 			unsigned flags)
 {
-	struct list_head irq_list = LIST_HEAD_INIT(irq_list);
-	struct list_head tmp = LIST_HEAD_INIT(tmp);
-	struct kvm_kernel_irq_routing_entry *e = NULL;
-	unsigned i;
+	struct kvm_irq_routing_table *new, *old;
+	u32 i, nr_rt_entries = 0;
 	int r;
 
+	for (i = 0; i < nr; ++i) {
+		if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
+			return -EINVAL;
+		nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
+	}
+
+	nr_rt_entries += 1;
+
+	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
+		      + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
+		      GFP_KERNEL);
+
+	if (!new)
+		return -ENOMEM;
+
+	new->rt_entries = (void *)&new->map[nr_rt_entries];
+
+	new->nr_rt_entries = nr_rt_entries;
+
 	for (i = 0; i < nr; ++i) {
 		r = -EINVAL;
-		if (ue->gsi >= KVM_MAX_IRQ_ROUTES)
-			goto out;
 		if (ue->flags)
 			goto out;
-		r = -ENOMEM;
-		e = kzalloc(sizeof(*e), GFP_KERNEL);
-		if (!e)
-			goto out;
-		r = setup_routing_entry(e, ue);
+		r = setup_routing_entry(new, &new->rt_entries[i], ue);
 		if (r)
 			goto out;
 		++ue;
-		list_add(&e->link, &irq_list);
-		e = NULL;
 	}
 
 	mutex_lock(&kvm->irq_lock);
-	list_splice(&kvm->irq_routing, &tmp);
-	INIT_LIST_HEAD(&kvm->irq_routing);
-	list_splice(&irq_list, &kvm->irq_routing);
-	INIT_LIST_HEAD(&irq_list);
-	list_splice(&tmp, &irq_list);
+	old = kvm->irq_routing;
+	kvm->irq_routing = new;
 	mutex_unlock(&kvm->irq_lock);
 
+	new = old;
 	r = 0;
 
 out:
-	kfree(e);
-	__kvm_free_irq_routing(&irq_list);
+	kfree(new);
 	return r;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 22b520b54411..3bee94892774 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -957,7 +957,6 @@ static struct kvm *kvm_create_vm(void)
 	if (IS_ERR(kvm))
 		goto out;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
-	INIT_LIST_HEAD(&kvm->irq_routing);
 	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
 #endif
 
-- 
cgit v1.2.3


From 3e71f88bc90792a187703860cf22fbed7c12cbd9 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 24 Aug 2009 11:54:21 +0300
Subject: KVM: Maintain back mapping from irqchip/pin to gsi

Maintain back mapping from irqchip/pin to gsi to speedup
interrupt acknowledgment notifications.

[avi: build fix on non-x86/ia64]

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm.h |  1 +
 arch/x86/include/asm/kvm.h  |  1 +
 include/linux/kvm_host.h    |  9 +++++++++
 virt/kvm/irq_comm.c         | 31 ++++++++++++++-----------------
 4 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index 18a7e49abbc5..bc90c75adf67 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -60,6 +60,7 @@ struct kvm_ioapic_state {
 #define KVM_IRQCHIP_PIC_MASTER   0
 #define KVM_IRQCHIP_PIC_SLAVE    1
 #define KVM_IRQCHIP_IOAPIC       2
+#define KVM_NR_IRQCHIPS          3
 
 #define KVM_CONTEXT_SIZE	8*1024
 
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe914dc59..f02e87a5206f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -79,6 +79,7 @@ struct kvm_ioapic_state {
 #define KVM_IRQCHIP_PIC_MASTER   0
 #define KVM_IRQCHIP_PIC_SLAVE    1
 #define KVM_IRQCHIP_IOAPIC       2
+#define KVM_NR_IRQCHIPS          3
 
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f403e66557fb..cc2d7493598b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -131,7 +131,10 @@ struct kvm_kernel_irq_routing_entry {
 	struct hlist_node link;
 };
 
+#ifdef __KVM_HAVE_IOAPIC
+
 struct kvm_irq_routing_table {
+	int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
 	struct kvm_kernel_irq_routing_entry *rt_entries;
 	u32 nr_rt_entries;
 	/*
@@ -141,6 +144,12 @@ struct kvm_irq_routing_table {
 	struct hlist_head map[0];
 };
 
+#else
+
+struct kvm_irq_routing_table {};
+
+#endif
+
 struct kvm {
 	spinlock_t mmu_lock;
 	spinlock_t requests_lock;
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 81950f6f6fd9..59cf8dae0062 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -175,25 +175,16 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
 	struct kvm_irq_ack_notifier *kian;
 	struct hlist_node *n;
-	unsigned gsi = pin;
-	int i;
+	int gsi;
 
 	trace_kvm_ack_irq(irqchip, pin);
 
-	for (i = 0; i < kvm->irq_routing->nr_rt_entries; i++) {
-		struct kvm_kernel_irq_routing_entry *e;
-		e = &kvm->irq_routing->rt_entries[i];
-		if (e->type == KVM_IRQ_ROUTING_IRQCHIP &&
-		    e->irqchip.irqchip == irqchip &&
-		    e->irqchip.pin == pin) {
-			gsi = e->gsi;
-			break;
-		}
-	}
-
-	hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
-		if (kian->gsi == gsi)
-			kian->irq_acked(kian);
+	gsi = kvm->irq_routing->chip[irqchip][pin];
+	if (gsi != -1)
+		hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list,
+				     link)
+			if (kian->gsi == gsi)
+				kian->irq_acked(kian);
 }
 
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
@@ -332,6 +323,9 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
 		}
 		e->irqchip.irqchip = ue->u.irqchip.irqchip;
 		e->irqchip.pin = ue->u.irqchip.pin + delta;
+		if (e->irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+			goto out;
+		rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
 		break;
 	case KVM_IRQ_ROUTING_MSI:
 		e->set = kvm_set_msi;
@@ -356,7 +350,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
 			unsigned flags)
 {
 	struct kvm_irq_routing_table *new, *old;
-	u32 i, nr_rt_entries = 0;
+	u32 i, j, nr_rt_entries = 0;
 	int r;
 
 	for (i = 0; i < nr; ++i) {
@@ -377,6 +371,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
 	new->rt_entries = (void *)&new->map[nr_rt_entries];
 
 	new->nr_rt_entries = nr_rt_entries;
+	for (i = 0; i < 3; i++)
+		for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
+			new->chip[i][j] = -1;
 
 	for (i = 0; i < nr; ++i) {
 		r = -EINVAL;
-- 
cgit v1.2.3


From 136bdfeee7b5bc986fc94af3a40d7d13ea37bb95 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 24 Aug 2009 11:54:23 +0300
Subject: KVM: Move irq ack notifier list to arch independent code

Mask irq notifier list is already there.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h | 1 -
 arch/x86/include/asm/kvm_host.h  | 1 -
 include/linux/kvm_host.h         | 1 +
 virt/kvm/irq_comm.c              | 5 ++---
 virt/kvm/kvm_main.c              | 1 +
 5 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index d9b6325a9328..a362e67e0ca6 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -475,7 +475,6 @@ struct kvm_arch {
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
 	int iommu_flags;
-	struct hlist_head irq_ack_notifier_list;
 
 	unsigned long irq_sources_bitmap;
 	unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 35d3236c9de4..a46e2dd9aca8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -397,7 +397,6 @@ struct kvm_arch{
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
-	struct hlist_head irq_ack_notifier_list;
 	int vapics_in_nmi_mode;
 
 	unsigned int tss_addr;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cc2d7493598b..4aa5e1d9a797 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -187,6 +187,7 @@ struct kvm {
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	struct kvm_irq_routing_table *irq_routing;
 	struct hlist_head mask_notifier_list;
+	struct hlist_head irq_ack_notifier_list;
 #endif
 
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index fb861dd956fc..f01972595938 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -186,8 +186,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 	rcu_read_unlock();
 
 	if (gsi != -1)
-		hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list,
-				     link)
+		hlist_for_each_entry(kian, n, &kvm->irq_ack_notifier_list, link)
 			if (kian->gsi == gsi)
 				kian->irq_acked(kian);
 }
@@ -196,7 +195,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian)
 {
 	mutex_lock(&kvm->irq_lock);
-	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
+	hlist_add_head(&kian->link, &kvm->irq_ack_notifier_list);
 	mutex_unlock(&kvm->irq_lock);
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3bee94892774..6eca153e1a02 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -958,6 +958,7 @@ static struct kvm *kvm_create_vm(void)
 		goto out;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
+	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
 
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
-- 
cgit v1.2.3


From bfd99ff5d483b11c32bca49fbff7a5ac59038b0a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 26 Aug 2009 14:57:50 +0300
Subject: KVM: Move assigned device code to own file

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/Makefile   |   2 +-
 arch/x86/kvm/Makefile    |   3 +-
 include/linux/kvm_host.h |  17 +
 virt/kvm/assigned-dev.c  | 818 +++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c      | 798 +--------------------------------------------
 5 files changed, 840 insertions(+), 798 deletions(-)
 create mode 100644 virt/kvm/assigned-dev.c

(limited to 'include/linux')

diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 0bb99b732908..1089b3e918ac 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -49,7 +49,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-		coalesced_mmio.o irq_comm.o)
+		coalesced_mmio.o irq_comm.o assigned-dev.o)
 
 ifeq ($(CONFIG_IOMMU_API),y)
 common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
 CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-				coalesced_mmio.o irq_comm.o eventfd.o)
+				coalesced_mmio.o irq_comm.o eventfd.o \
+				assigned-dev.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4aa5e1d9a797..c0a1cc35f080 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -577,4 +577,21 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
 	return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
 }
 #endif
+
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+				  unsigned long arg);
+
+#else
+
+static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+						unsigned long arg)
+{
+	return -ENOTTY;
+}
+
 #endif
+
+#endif
+
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
new file mode 100644
index 000000000000..fd9c097b760a
--- /dev/null
+++ b/virt/kvm/assigned-dev.c
@@ -0,0 +1,818 @@
+/*
+ * Kernel-based Virtual Machine - device assignment support
+ *
+ * Copyright (C) 2006-9 Red Hat, Inc
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include "irq.h"
+
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+						      int assigned_dev_id)
+{
+	struct list_head *ptr;
+	struct kvm_assigned_dev_kernel *match;
+
+	list_for_each(ptr, head) {
+		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+		if (match->assigned_dev_id == assigned_dev_id)
+			return match;
+	}
+	return NULL;
+}
+
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+				    *assigned_dev, int irq)
+{
+	int i, index;
+	struct msix_entry *host_msix_entries;
+
+	host_msix_entries = assigned_dev->host_msix_entries;
+
+	index = -1;
+	for (i = 0; i < assigned_dev->entries_nr; i++)
+		if (irq == host_msix_entries[i].vector) {
+			index = i;
+			break;
+		}
+	if (index < 0) {
+		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
+		return 0;
+	}
+
+	return index;
+}
+
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev;
+	struct kvm *kvm;
+	int i;
+
+	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
+				    interrupt_work);
+	kvm = assigned_dev->kvm;
+
+	spin_lock_irq(&assigned_dev->assigned_dev_lock);
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+		struct kvm_guest_msix_entry *guest_entries =
+			assigned_dev->guest_msix_entries;
+		for (i = 0; i < assigned_dev->entries_nr; i++) {
+			if (!(guest_entries[i].flags &
+					KVM_ASSIGNED_MSIX_PENDING))
+				continue;
+			guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
+			kvm_set_irq(assigned_dev->kvm,
+				    assigned_dev->irq_source_id,
+				    guest_entries[i].vector, 1);
+		}
+	} else
+		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+			    assigned_dev->guest_irq, 1);
+
+	spin_unlock_irq(&assigned_dev->assigned_dev_lock);
+}
+
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+	unsigned long flags;
+	struct kvm_assigned_dev_kernel *assigned_dev =
+		(struct kvm_assigned_dev_kernel *) dev_id;
+
+	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+		int index = find_index_from_host_irq(assigned_dev, irq);
+		if (index < 0)
+			goto out;
+		assigned_dev->guest_msix_entries[index].flags |=
+			KVM_ASSIGNED_MSIX_PENDING;
+	}
+
+	schedule_work(&assigned_dev->interrupt_work);
+
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+		disable_irq_nosync(irq);
+		assigned_dev->host_irq_disabled = true;
+	}
+
+out:
+	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
+	return IRQ_HANDLED;
+}
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+	struct kvm_assigned_dev_kernel *dev;
+	unsigned long flags;
+
+	if (kian->gsi == -1)
+		return;
+
+	dev = container_of(kian, struct kvm_assigned_dev_kernel,
+			   ack_notifier);
+
+	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
+
+	/* The guest irq may be shared so this ack may be
+	 * from another device.
+	 */
+	spin_lock_irqsave(&dev->assigned_dev_lock, flags);
+	if (dev->host_irq_disabled) {
+		enable_irq(dev->host_irq);
+		dev->host_irq_disabled = false;
+	}
+	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
+}
+
+static void deassign_guest_irq(struct kvm *kvm,
+			       struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+	assigned_dev->ack_notifier.gsi = -1;
+
+	if (assigned_dev->irq_source_id != -1)
+		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+	assigned_dev->irq_source_id = -1;
+	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
+
+/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
+static void deassign_host_irq(struct kvm *kvm,
+			      struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	/*
+	 * In kvm_free_device_irq, cancel_work_sync return true if:
+	 * 1. work is scheduled, and then cancelled.
+	 * 2. work callback is executed.
+	 *
+	 * The first one ensured that the irq is disabled and no more events
+	 * would happen. But for the second one, the irq may be enabled (e.g.
+	 * for MSI). So we disable irq here to prevent further events.
+	 *
+	 * Notice this maybe result in nested disable if the interrupt type is
+	 * INTx, but it's OK for we are going to free it.
+	 *
+	 * If this function is a part of VM destroy, please ensure that till
+	 * now, the kvm state is still legal for probably we also have to wait
+	 * interrupt_work done.
+	 */
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+		int i;
+		for (i = 0; i < assigned_dev->entries_nr; i++)
+			disable_irq_nosync(assigned_dev->
+					   host_msix_entries[i].vector);
+
+		cancel_work_sync(&assigned_dev->interrupt_work);
+
+		for (i = 0; i < assigned_dev->entries_nr; i++)
+			free_irq(assigned_dev->host_msix_entries[i].vector,
+				 (void *)assigned_dev);
+
+		assigned_dev->entries_nr = 0;
+		kfree(assigned_dev->host_msix_entries);
+		kfree(assigned_dev->guest_msix_entries);
+		pci_disable_msix(assigned_dev->dev);
+	} else {
+		/* Deal with MSI and INTx */
+		disable_irq_nosync(assigned_dev->host_irq);
+		cancel_work_sync(&assigned_dev->interrupt_work);
+
+		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
+			pci_disable_msi(assigned_dev->dev);
+	}
+
+	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+static int kvm_deassign_irq(struct kvm *kvm,
+			    struct kvm_assigned_dev_kernel *assigned_dev,
+			    unsigned long irq_requested_type)
+{
+	unsigned long guest_irq_type, host_irq_type;
+
+	if (!irqchip_in_kernel(kvm))
+		return -EINVAL;
+	/* no irq assignment to deassign */
+	if (!assigned_dev->irq_requested_type)
+		return -ENXIO;
+
+	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+	if (host_irq_type)
+		deassign_host_irq(kvm, assigned_dev);
+	if (guest_irq_type)
+		deassign_guest_irq(kvm, assigned_dev);
+
+	return 0;
+}
+
+static void kvm_free_assigned_irq(struct kvm *kvm,
+				  struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+				     struct kvm_assigned_dev_kernel
+				     *assigned_dev)
+{
+	kvm_free_assigned_irq(kvm, assigned_dev);
+
+	pci_reset_function(assigned_dev->dev);
+
+	pci_release_regions(assigned_dev->dev);
+	pci_disable_device(assigned_dev->dev);
+	pci_dev_put(assigned_dev->dev);
+
+	list_del(&assigned_dev->list);
+	kfree(assigned_dev);
+}
+
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+	struct list_head *ptr, *ptr2;
+	struct kvm_assigned_dev_kernel *assigned_dev;
+
+	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+		assigned_dev = list_entry(ptr,
+					  struct kvm_assigned_dev_kernel,
+					  list);
+
+		kvm_free_assigned_device(kvm, assigned_dev);
+	}
+}
+
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+					    struct kvm_assigned_dev_kernel *dev)
+{
+	dev->host_irq = dev->dev->irq;
+	/* Even though this is PCI, we don't want to use shared
+	 * interrupts. Sharing host devices with guest-assigned devices
+	 * on the same interrupt line is not a happy situation: there
+	 * are going to be long delays in accepting, acking, etc.
+	 */
+	if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
+			0, "kvm_assigned_intx_device", (void *)dev))
+		return -EIO;
+	return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+					   struct kvm_assigned_dev_kernel *dev)
+{
+	int r;
+
+	if (!dev->dev->msi_enabled) {
+		r = pci_enable_msi(dev->dev);
+		if (r)
+			return r;
+	}
+
+	dev->host_irq = dev->dev->irq;
+	if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
+			"kvm_assigned_msi_device", (void *)dev)) {
+		pci_disable_msi(dev->dev);
+		return -EIO;
+	}
+
+	return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+					    struct kvm_assigned_dev_kernel *dev)
+{
+	int i, r = -EINVAL;
+
+	/* host_msix_entries and guest_msix_entries should have been
+	 * initialized */
+	if (dev->entries_nr == 0)
+		return r;
+
+	r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
+	if (r)
+		return r;
+
+	for (i = 0; i < dev->entries_nr; i++) {
+		r = request_irq(dev->host_msix_entries[i].vector,
+				kvm_assigned_dev_intr, 0,
+				"kvm_assigned_msix_device",
+				(void *)dev);
+		/* FIXME: free requested_irq's on failure */
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+#endif
+
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *dev,
+				struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = irq->guest_irq;
+	return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *dev,
+			struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = -1;
+	dev->host_irq_disabled = false;
+	return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *dev,
+			struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = -1;
+	dev->host_irq_disabled = false;
+	return 0;
+}
+#endif
+
+static int assign_host_irq(struct kvm *kvm,
+			   struct kvm_assigned_dev_kernel *dev,
+			   __u32 host_irq_type)
+{
+	int r = -EEXIST;
+
+	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+		return r;
+
+	switch (host_irq_type) {
+	case KVM_DEV_IRQ_HOST_INTX:
+		r = assigned_device_enable_host_intx(kvm, dev);
+		break;
+#ifdef __KVM_HAVE_MSI
+	case KVM_DEV_IRQ_HOST_MSI:
+		r = assigned_device_enable_host_msi(kvm, dev);
+		break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+	case KVM_DEV_IRQ_HOST_MSIX:
+		r = assigned_device_enable_host_msix(kvm, dev);
+		break;
+#endif
+	default:
+		r = -EINVAL;
+	}
+
+	if (!r)
+		dev->irq_requested_type |= host_irq_type;
+
+	return r;
+}
+
+static int assign_guest_irq(struct kvm *kvm,
+			    struct kvm_assigned_dev_kernel *dev,
+			    struct kvm_assigned_irq *irq,
+			    unsigned long guest_irq_type)
+{
+	int id;
+	int r = -EEXIST;
+
+	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+		return r;
+
+	id = kvm_request_irq_source_id(kvm);
+	if (id < 0)
+		return id;
+
+	dev->irq_source_id = id;
+
+	switch (guest_irq_type) {
+	case KVM_DEV_IRQ_GUEST_INTX:
+		r = assigned_device_enable_guest_intx(kvm, dev, irq);
+		break;
+#ifdef __KVM_HAVE_MSI
+	case KVM_DEV_IRQ_GUEST_MSI:
+		r = assigned_device_enable_guest_msi(kvm, dev, irq);
+		break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+	case KVM_DEV_IRQ_GUEST_MSIX:
+		r = assigned_device_enable_guest_msix(kvm, dev, irq);
+		break;
+#endif
+	default:
+		r = -EINVAL;
+	}
+
+	if (!r) {
+		dev->irq_requested_type |= guest_irq_type;
+		kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+	} else
+		kvm_free_irq_source_id(kvm, dev->irq_source_id);
+
+	return r;
+}
+
+/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+				   struct kvm_assigned_irq *assigned_irq)
+{
+	int r = -EINVAL;
+	struct kvm_assigned_dev_kernel *match;
+	unsigned long host_irq_type, guest_irq_type;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	mutex_lock(&kvm->lock);
+	r = -ENODEV;
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_irq->assigned_dev_id);
+	if (!match)
+		goto out;
+
+	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
+	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
+
+	r = -EINVAL;
+	/* can only assign one type at a time */
+	if (hweight_long(host_irq_type) > 1)
+		goto out;
+	if (hweight_long(guest_irq_type) > 1)
+		goto out;
+	if (host_irq_type == 0 && guest_irq_type == 0)
+		goto out;
+
+	r = 0;
+	if (host_irq_type)
+		r = assign_host_irq(kvm, match, host_irq_type);
+	if (r)
+		goto out;
+
+	if (guest_irq_type)
+		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+					 struct kvm_assigned_irq
+					 *assigned_irq)
+{
+	int r = -ENODEV;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_irq->assigned_dev_id);
+	if (!match)
+		goto out;
+
+	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+				      struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+	struct pci_dev *dev;
+
+	down_read(&kvm->slots_lock);
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (match) {
+		/* device already assigned */
+		r = -EEXIST;
+		goto out;
+	}
+
+	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+	if (match == NULL) {
+		printk(KERN_INFO "%s: Couldn't allocate memory\n",
+		       __func__);
+		r = -ENOMEM;
+		goto out;
+	}
+	dev = pci_get_bus_and_slot(assigned_dev->busnr,
+				   assigned_dev->devfn);
+	if (!dev) {
+		printk(KERN_INFO "%s: host device not found\n", __func__);
+		r = -EINVAL;
+		goto out_free;
+	}
+	if (pci_enable_device(dev)) {
+		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+		r = -EBUSY;
+		goto out_put;
+	}
+	r = pci_request_regions(dev, "kvm_assigned_device");
+	if (r) {
+		printk(KERN_INFO "%s: Could not get access to device regions\n",
+		       __func__);
+		goto out_disable;
+	}
+
+	pci_reset_function(dev);
+
+	match->assigned_dev_id = assigned_dev->assigned_dev_id;
+	match->host_busnr = assigned_dev->busnr;
+	match->host_devfn = assigned_dev->devfn;
+	match->flags = assigned_dev->flags;
+	match->dev = dev;
+	spin_lock_init(&match->assigned_dev_lock);
+	match->irq_source_id = -1;
+	match->kvm = kvm;
+	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+	INIT_WORK(&match->interrupt_work,
+		  kvm_assigned_dev_interrupt_work_handler);
+
+	list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+		if (!kvm->arch.iommu_domain) {
+			r = kvm_iommu_map_guest(kvm);
+			if (r)
+				goto out_list_del;
+		}
+		r = kvm_assign_device(kvm, match);
+		if (r)
+			goto out_list_del;
+	}
+
+out:
+	mutex_unlock(&kvm->lock);
+	up_read(&kvm->slots_lock);
+	return r;
+out_list_del:
+	list_del(&match->list);
+	pci_release_regions(dev);
+out_disable:
+	pci_disable_device(dev);
+out_put:
+	pci_dev_put(dev);
+out_free:
+	kfree(match);
+	mutex_unlock(&kvm->lock);
+	up_read(&kvm->slots_lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
+		struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (!match) {
+		printk(KERN_INFO "%s: device hasn't been assigned before, "
+		  "so cannot be deassigned\n", __func__);
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
+		kvm_deassign_device(kvm, match);
+
+	kvm_free_assigned_device(kvm, match);
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+
+#ifdef __KVM_HAVE_MSIX
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+				    struct kvm_assigned_msix_nr *entry_nr)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *adev;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      entry_nr->assigned_dev_id);
+	if (!adev) {
+		r = -EINVAL;
+		goto msix_nr_out;
+	}
+
+	if (adev->entries_nr == 0) {
+		adev->entries_nr = entry_nr->entry_nr;
+		if (adev->entries_nr == 0 ||
+		    adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
+			r = -EINVAL;
+			goto msix_nr_out;
+		}
+
+		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
+						entry_nr->entry_nr,
+						GFP_KERNEL);
+		if (!adev->host_msix_entries) {
+			r = -ENOMEM;
+			goto msix_nr_out;
+		}
+		adev->guest_msix_entries = kzalloc(
+				sizeof(struct kvm_guest_msix_entry) *
+				entry_nr->entry_nr, GFP_KERNEL);
+		if (!adev->guest_msix_entries) {
+			kfree(adev->host_msix_entries);
+			r = -ENOMEM;
+			goto msix_nr_out;
+		}
+	} else /* Not allowed set MSI-X number twice */
+		r = -EINVAL;
+msix_nr_out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+				       struct kvm_assigned_msix_entry *entry)
+{
+	int r = 0, i;
+	struct kvm_assigned_dev_kernel *adev;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      entry->assigned_dev_id);
+
+	if (!adev) {
+		r = -EINVAL;
+		goto msix_entry_out;
+	}
+
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->guest_msix_entries[i].vector == 0 ||
+		    adev->guest_msix_entries[i].entry == entry->entry) {
+			adev->guest_msix_entries[i].entry = entry->entry;
+			adev->guest_msix_entries[i].vector = entry->gsi;
+			adev->host_msix_entries[i].entry = entry->entry;
+			break;
+		}
+	if (i == adev->entries_nr) {
+		r = -ENOSPC;
+		goto msix_entry_out;
+	}
+
+msix_entry_out:
+	mutex_unlock(&kvm->lock);
+
+	return r;
+}
+#endif
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+				  unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	int r = -ENOTTY;
+
+	switch (ioctl) {
+	case KVM_ASSIGN_PCI_DEVICE: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			goto out;
+		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_ASSIGN_IRQ: {
+		r = -EOPNOTSUPP;
+		break;
+	}
+#ifdef KVM_CAP_ASSIGN_DEV_IRQ
+	case KVM_ASSIGN_DEV_IRQ: {
+		struct kvm_assigned_irq assigned_irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+			goto out;
+		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_DEASSIGN_DEV_IRQ: {
+		struct kvm_assigned_irq assigned_irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+			goto out;
+		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+		if (r)
+			goto out;
+		break;
+	}
+#endif
+#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
+	case KVM_DEASSIGN_PCI_DEVICE: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			goto out;
+		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
+		if (r)
+			goto out;
+		break;
+	}
+#endif
+#ifdef KVM_CAP_IRQ_ROUTING
+	case KVM_SET_GSI_ROUTING: {
+		struct kvm_irq_routing routing;
+		struct kvm_irq_routing __user *urouting;
+		struct kvm_irq_routing_entry *entries;
+
+		r = -EFAULT;
+		if (copy_from_user(&routing, argp, sizeof(routing)))
+			goto out;
+		r = -EINVAL;
+		if (routing.nr >= KVM_MAX_IRQ_ROUTES)
+			goto out;
+		if (routing.flags)
+			goto out;
+		r = -ENOMEM;
+		entries = vmalloc(routing.nr * sizeof(*entries));
+		if (!entries)
+			goto out;
+		r = -EFAULT;
+		urouting = argp;
+		if (copy_from_user(entries, urouting->entries,
+				   routing.nr * sizeof(*entries)))
+			goto out_free_irq_routing;
+		r = kvm_set_irq_routing(kvm, entries, routing.nr,
+					routing.flags);
+	out_free_irq_routing:
+		vfree(entries);
+		break;
+	}
+#endif /* KVM_CAP_IRQ_ROUTING */
+#ifdef __KVM_HAVE_MSIX
+	case KVM_ASSIGN_SET_MSIX_NR: {
+		struct kvm_assigned_msix_nr entry_nr;
+		r = -EFAULT;
+		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+			goto out;
+		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_ASSIGN_SET_MSIX_ENTRY: {
+		struct kvm_assigned_msix_entry entry;
+		r = -EFAULT;
+		if (copy_from_user(&entry, argp, sizeof entry))
+			goto out;
+		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+		if (r)
+			goto out;
+		break;
+	}
+#endif
+	}
+out:
+	return r;
+}
+
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c12c95b1b641..38e4d2c34ac1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -53,12 +53,6 @@
 #include "coalesced_mmio.h"
 #endif
 
-#ifdef KVM_CAP_DEVICE_ASSIGNMENT
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include "irq.h"
-#endif
-
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
@@ -90,608 +84,6 @@ static bool kvm_rebooting;
 
 static bool largepages_enabled = true;
 
-#ifdef KVM_CAP_DEVICE_ASSIGNMENT
-static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
-						      int assigned_dev_id)
-{
-	struct list_head *ptr;
-	struct kvm_assigned_dev_kernel *match;
-
-	list_for_each(ptr, head) {
-		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
-		if (match->assigned_dev_id == assigned_dev_id)
-			return match;
-	}
-	return NULL;
-}
-
-static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
-				    *assigned_dev, int irq)
-{
-	int i, index;
-	struct msix_entry *host_msix_entries;
-
-	host_msix_entries = assigned_dev->host_msix_entries;
-
-	index = -1;
-	for (i = 0; i < assigned_dev->entries_nr; i++)
-		if (irq == host_msix_entries[i].vector) {
-			index = i;
-			break;
-		}
-	if (index < 0) {
-		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
-		return 0;
-	}
-
-	return index;
-}
-
-static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev;
-	struct kvm *kvm;
-	int i;
-
-	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
-				    interrupt_work);
-	kvm = assigned_dev->kvm;
-
-	spin_lock_irq(&assigned_dev->assigned_dev_lock);
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-		struct kvm_guest_msix_entry *guest_entries =
-			assigned_dev->guest_msix_entries;
-		for (i = 0; i < assigned_dev->entries_nr; i++) {
-			if (!(guest_entries[i].flags &
-					KVM_ASSIGNED_MSIX_PENDING))
-				continue;
-			guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
-			kvm_set_irq(assigned_dev->kvm,
-				    assigned_dev->irq_source_id,
-				    guest_entries[i].vector, 1);
-		}
-	} else
-		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-			    assigned_dev->guest_irq, 1);
-
-	spin_unlock_irq(&assigned_dev->assigned_dev_lock);
-}
-
-static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
-{
-	unsigned long flags;
-	struct kvm_assigned_dev_kernel *assigned_dev =
-		(struct kvm_assigned_dev_kernel *) dev_id;
-
-	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-		int index = find_index_from_host_irq(assigned_dev, irq);
-		if (index < 0)
-			goto out;
-		assigned_dev->guest_msix_entries[index].flags |=
-			KVM_ASSIGNED_MSIX_PENDING;
-	}
-
-	schedule_work(&assigned_dev->interrupt_work);
-
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
-		disable_irq_nosync(irq);
-		assigned_dev->host_irq_disabled = true;
-	}
-
-out:
-	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
-	return IRQ_HANDLED;
-}
-
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
-	struct kvm_assigned_dev_kernel *dev;
-	unsigned long flags;
-
-	if (kian->gsi == -1)
-		return;
-
-	dev = container_of(kian, struct kvm_assigned_dev_kernel,
-			   ack_notifier);
-
-	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
-
-	/* The guest irq may be shared so this ack may be
-	 * from another device.
-	 */
-	spin_lock_irqsave(&dev->assigned_dev_lock, flags);
-	if (dev->host_irq_disabled) {
-		enable_irq(dev->host_irq);
-		dev->host_irq_disabled = false;
-	}
-	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
-}
-
-static void deassign_guest_irq(struct kvm *kvm,
-			       struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
-	assigned_dev->ack_notifier.gsi = -1;
-
-	if (assigned_dev->irq_source_id != -1)
-		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
-	assigned_dev->irq_source_id = -1;
-	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
-}
-
-/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
-static void deassign_host_irq(struct kvm *kvm,
-			      struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	/*
-	 * In kvm_free_device_irq, cancel_work_sync return true if:
-	 * 1. work is scheduled, and then cancelled.
-	 * 2. work callback is executed.
-	 *
-	 * The first one ensured that the irq is disabled and no more events
-	 * would happen. But for the second one, the irq may be enabled (e.g.
-	 * for MSI). So we disable irq here to prevent further events.
-	 *
-	 * Notice this maybe result in nested disable if the interrupt type is
-	 * INTx, but it's OK for we are going to free it.
-	 *
-	 * If this function is a part of VM destroy, please ensure that till
-	 * now, the kvm state is still legal for probably we also have to wait
-	 * interrupt_work done.
-	 */
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-		int i;
-		for (i = 0; i < assigned_dev->entries_nr; i++)
-			disable_irq_nosync(assigned_dev->
-					   host_msix_entries[i].vector);
-
-		cancel_work_sync(&assigned_dev->interrupt_work);
-
-		for (i = 0; i < assigned_dev->entries_nr; i++)
-			free_irq(assigned_dev->host_msix_entries[i].vector,
-				 (void *)assigned_dev);
-
-		assigned_dev->entries_nr = 0;
-		kfree(assigned_dev->host_msix_entries);
-		kfree(assigned_dev->guest_msix_entries);
-		pci_disable_msix(assigned_dev->dev);
-	} else {
-		/* Deal with MSI and INTx */
-		disable_irq_nosync(assigned_dev->host_irq);
-		cancel_work_sync(&assigned_dev->interrupt_work);
-
-		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
-
-		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
-			pci_disable_msi(assigned_dev->dev);
-	}
-
-	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
-}
-
-static int kvm_deassign_irq(struct kvm *kvm,
-			    struct kvm_assigned_dev_kernel *assigned_dev,
-			    unsigned long irq_requested_type)
-{
-	unsigned long guest_irq_type, host_irq_type;
-
-	if (!irqchip_in_kernel(kvm))
-		return -EINVAL;
-	/* no irq assignment to deassign */
-	if (!assigned_dev->irq_requested_type)
-		return -ENXIO;
-
-	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
-	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
-
-	if (host_irq_type)
-		deassign_host_irq(kvm, assigned_dev);
-	if (guest_irq_type)
-		deassign_guest_irq(kvm, assigned_dev);
-
-	return 0;
-}
-
-static void kvm_free_assigned_irq(struct kvm *kvm,
-				  struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
-}
-
-static void kvm_free_assigned_device(struct kvm *kvm,
-				     struct kvm_assigned_dev_kernel
-				     *assigned_dev)
-{
-	kvm_free_assigned_irq(kvm, assigned_dev);
-
-	pci_reset_function(assigned_dev->dev);
-
-	pci_release_regions(assigned_dev->dev);
-	pci_disable_device(assigned_dev->dev);
-	pci_dev_put(assigned_dev->dev);
-
-	list_del(&assigned_dev->list);
-	kfree(assigned_dev);
-}
-
-void kvm_free_all_assigned_devices(struct kvm *kvm)
-{
-	struct list_head *ptr, *ptr2;
-	struct kvm_assigned_dev_kernel *assigned_dev;
-
-	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
-		assigned_dev = list_entry(ptr,
-					  struct kvm_assigned_dev_kernel,
-					  list);
-
-		kvm_free_assigned_device(kvm, assigned_dev);
-	}
-}
-
-static int assigned_device_enable_host_intx(struct kvm *kvm,
-					    struct kvm_assigned_dev_kernel *dev)
-{
-	dev->host_irq = dev->dev->irq;
-	/* Even though this is PCI, we don't want to use shared
-	 * interrupts. Sharing host devices with guest-assigned devices
-	 * on the same interrupt line is not a happy situation: there
-	 * are going to be long delays in accepting, acking, etc.
-	 */
-	if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
-			0, "kvm_assigned_intx_device", (void *)dev))
-		return -EIO;
-	return 0;
-}
-
-#ifdef __KVM_HAVE_MSI
-static int assigned_device_enable_host_msi(struct kvm *kvm,
-					   struct kvm_assigned_dev_kernel *dev)
-{
-	int r;
-
-	if (!dev->dev->msi_enabled) {
-		r = pci_enable_msi(dev->dev);
-		if (r)
-			return r;
-	}
-
-	dev->host_irq = dev->dev->irq;
-	if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
-			"kvm_assigned_msi_device", (void *)dev)) {
-		pci_disable_msi(dev->dev);
-		return -EIO;
-	}
-
-	return 0;
-}
-#endif
-
-#ifdef __KVM_HAVE_MSIX
-static int assigned_device_enable_host_msix(struct kvm *kvm,
-					    struct kvm_assigned_dev_kernel *dev)
-{
-	int i, r = -EINVAL;
-
-	/* host_msix_entries and guest_msix_entries should have been
-	 * initialized */
-	if (dev->entries_nr == 0)
-		return r;
-
-	r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
-	if (r)
-		return r;
-
-	for (i = 0; i < dev->entries_nr; i++) {
-		r = request_irq(dev->host_msix_entries[i].vector,
-				kvm_assigned_dev_intr, 0,
-				"kvm_assigned_msix_device",
-				(void *)dev);
-		/* FIXME: free requested_irq's on failure */
-		if (r)
-			return r;
-	}
-
-	return 0;
-}
-
-#endif
-
-static int assigned_device_enable_guest_intx(struct kvm *kvm,
-				struct kvm_assigned_dev_kernel *dev,
-				struct kvm_assigned_irq *irq)
-{
-	dev->guest_irq = irq->guest_irq;
-	dev->ack_notifier.gsi = irq->guest_irq;
-	return 0;
-}
-
-#ifdef __KVM_HAVE_MSI
-static int assigned_device_enable_guest_msi(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *dev,
-			struct kvm_assigned_irq *irq)
-{
-	dev->guest_irq = irq->guest_irq;
-	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
-	return 0;
-}
-#endif
-#ifdef __KVM_HAVE_MSIX
-static int assigned_device_enable_guest_msix(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *dev,
-			struct kvm_assigned_irq *irq)
-{
-	dev->guest_irq = irq->guest_irq;
-	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
-	return 0;
-}
-#endif
-
-static int assign_host_irq(struct kvm *kvm,
-			   struct kvm_assigned_dev_kernel *dev,
-			   __u32 host_irq_type)
-{
-	int r = -EEXIST;
-
-	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
-		return r;
-
-	switch (host_irq_type) {
-	case KVM_DEV_IRQ_HOST_INTX:
-		r = assigned_device_enable_host_intx(kvm, dev);
-		break;
-#ifdef __KVM_HAVE_MSI
-	case KVM_DEV_IRQ_HOST_MSI:
-		r = assigned_device_enable_host_msi(kvm, dev);
-		break;
-#endif
-#ifdef __KVM_HAVE_MSIX
-	case KVM_DEV_IRQ_HOST_MSIX:
-		r = assigned_device_enable_host_msix(kvm, dev);
-		break;
-#endif
-	default:
-		r = -EINVAL;
-	}
-
-	if (!r)
-		dev->irq_requested_type |= host_irq_type;
-
-	return r;
-}
-
-static int assign_guest_irq(struct kvm *kvm,
-			    struct kvm_assigned_dev_kernel *dev,
-			    struct kvm_assigned_irq *irq,
-			    unsigned long guest_irq_type)
-{
-	int id;
-	int r = -EEXIST;
-
-	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
-		return r;
-
-	id = kvm_request_irq_source_id(kvm);
-	if (id < 0)
-		return id;
-
-	dev->irq_source_id = id;
-
-	switch (guest_irq_type) {
-	case KVM_DEV_IRQ_GUEST_INTX:
-		r = assigned_device_enable_guest_intx(kvm, dev, irq);
-		break;
-#ifdef __KVM_HAVE_MSI
-	case KVM_DEV_IRQ_GUEST_MSI:
-		r = assigned_device_enable_guest_msi(kvm, dev, irq);
-		break;
-#endif
-#ifdef __KVM_HAVE_MSIX
-	case KVM_DEV_IRQ_GUEST_MSIX:
-		r = assigned_device_enable_guest_msix(kvm, dev, irq);
-		break;
-#endif
-	default:
-		r = -EINVAL;
-	}
-
-	if (!r) {
-		dev->irq_requested_type |= guest_irq_type;
-		kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
-	} else
-		kvm_free_irq_source_id(kvm, dev->irq_source_id);
-
-	return r;
-}
-
-/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
-static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
-				   struct kvm_assigned_irq *assigned_irq)
-{
-	int r = -EINVAL;
-	struct kvm_assigned_dev_kernel *match;
-	unsigned long host_irq_type, guest_irq_type;
-
-	if (!capable(CAP_SYS_RAWIO))
-		return -EPERM;
-
-	if (!irqchip_in_kernel(kvm))
-		return r;
-
-	mutex_lock(&kvm->lock);
-	r = -ENODEV;
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_irq->assigned_dev_id);
-	if (!match)
-		goto out;
-
-	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
-	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
-
-	r = -EINVAL;
-	/* can only assign one type at a time */
-	if (hweight_long(host_irq_type) > 1)
-		goto out;
-	if (hweight_long(guest_irq_type) > 1)
-		goto out;
-	if (host_irq_type == 0 && guest_irq_type == 0)
-		goto out;
-
-	r = 0;
-	if (host_irq_type)
-		r = assign_host_irq(kvm, match, host_irq_type);
-	if (r)
-		goto out;
-
-	if (guest_irq_type)
-		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
-					 struct kvm_assigned_irq
-					 *assigned_irq)
-{
-	int r = -ENODEV;
-	struct kvm_assigned_dev_kernel *match;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_irq->assigned_dev_id);
-	if (!match)
-		goto out;
-
-	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
-				      struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-	struct pci_dev *dev;
-
-	down_read(&kvm->slots_lock);
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (match) {
-		/* device already assigned */
-		r = -EEXIST;
-		goto out;
-	}
-
-	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
-	if (match == NULL) {
-		printk(KERN_INFO "%s: Couldn't allocate memory\n",
-		       __func__);
-		r = -ENOMEM;
-		goto out;
-	}
-	dev = pci_get_bus_and_slot(assigned_dev->busnr,
-				   assigned_dev->devfn);
-	if (!dev) {
-		printk(KERN_INFO "%s: host device not found\n", __func__);
-		r = -EINVAL;
-		goto out_free;
-	}
-	if (pci_enable_device(dev)) {
-		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
-		r = -EBUSY;
-		goto out_put;
-	}
-	r = pci_request_regions(dev, "kvm_assigned_device");
-	if (r) {
-		printk(KERN_INFO "%s: Could not get access to device regions\n",
-		       __func__);
-		goto out_disable;
-	}
-
-	pci_reset_function(dev);
-
-	match->assigned_dev_id = assigned_dev->assigned_dev_id;
-	match->host_busnr = assigned_dev->busnr;
-	match->host_devfn = assigned_dev->devfn;
-	match->flags = assigned_dev->flags;
-	match->dev = dev;
-	spin_lock_init(&match->assigned_dev_lock);
-	match->irq_source_id = -1;
-	match->kvm = kvm;
-	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-	INIT_WORK(&match->interrupt_work,
-		  kvm_assigned_dev_interrupt_work_handler);
-
-	list_add(&match->list, &kvm->arch.assigned_dev_head);
-
-	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
-		if (!kvm->arch.iommu_domain) {
-			r = kvm_iommu_map_guest(kvm);
-			if (r)
-				goto out_list_del;
-		}
-		r = kvm_assign_device(kvm, match);
-		if (r)
-			goto out_list_del;
-	}
-
-out:
-	mutex_unlock(&kvm->lock);
-	up_read(&kvm->slots_lock);
-	return r;
-out_list_del:
-	list_del(&match->list);
-	pci_release_regions(dev);
-out_disable:
-	pci_disable_device(dev);
-out_put:
-	pci_dev_put(dev);
-out_free:
-	kfree(match);
-	mutex_unlock(&kvm->lock);
-	up_read(&kvm->slots_lock);
-	return r;
-}
-#endif
-
-#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
-static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
-		struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (!match) {
-		printk(KERN_INFO "%s: device hasn't been assigned before, "
-		  "so cannot be deassigned\n", __func__);
-		r = -EINVAL;
-		goto out;
-	}
-
-	if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
-		kvm_deassign_device(kvm, match);
-
-	kvm_free_assigned_device(kvm, match);
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-#endif
-
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
@@ -1824,88 +1216,6 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
 	return 0;
 }
 
-#ifdef __KVM_HAVE_MSIX
-static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
-				    struct kvm_assigned_msix_nr *entry_nr)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *adev;
-
-	mutex_lock(&kvm->lock);
-
-	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      entry_nr->assigned_dev_id);
-	if (!adev) {
-		r = -EINVAL;
-		goto msix_nr_out;
-	}
-
-	if (adev->entries_nr == 0) {
-		adev->entries_nr = entry_nr->entry_nr;
-		if (adev->entries_nr == 0 ||
-		    adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
-			r = -EINVAL;
-			goto msix_nr_out;
-		}
-
-		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
-						entry_nr->entry_nr,
-						GFP_KERNEL);
-		if (!adev->host_msix_entries) {
-			r = -ENOMEM;
-			goto msix_nr_out;
-		}
-		adev->guest_msix_entries = kzalloc(
-				sizeof(struct kvm_guest_msix_entry) *
-				entry_nr->entry_nr, GFP_KERNEL);
-		if (!adev->guest_msix_entries) {
-			kfree(adev->host_msix_entries);
-			r = -ENOMEM;
-			goto msix_nr_out;
-		}
-	} else /* Not allowed set MSI-X number twice */
-		r = -EINVAL;
-msix_nr_out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
-				       struct kvm_assigned_msix_entry *entry)
-{
-	int r = 0, i;
-	struct kvm_assigned_dev_kernel *adev;
-
-	mutex_lock(&kvm->lock);
-
-	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      entry->assigned_dev_id);
-
-	if (!adev) {
-		r = -EINVAL;
-		goto msix_entry_out;
-	}
-
-	for (i = 0; i < adev->entries_nr; i++)
-		if (adev->guest_msix_entries[i].vector == 0 ||
-		    adev->guest_msix_entries[i].entry == entry->entry) {
-			adev->guest_msix_entries[i].entry = entry->entry;
-			adev->guest_msix_entries[i].vector = entry->gsi;
-			adev->host_msix_entries[i].entry = entry->entry;
-			break;
-		}
-	if (i == adev->entries_nr) {
-		r = -ENOSPC;
-		goto msix_entry_out;
-	}
-
-msix_entry_out:
-	mutex_unlock(&kvm->lock);
-
-	return r;
-}
-#endif
-
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -2163,112 +1473,6 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
-#endif
-#ifdef KVM_CAP_DEVICE_ASSIGNMENT
-	case KVM_ASSIGN_PCI_DEVICE: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_IRQ: {
-		r = -EOPNOTSUPP;
-		break;
-	}
-#ifdef KVM_CAP_ASSIGN_DEV_IRQ
-	case KVM_ASSIGN_DEV_IRQ: {
-		struct kvm_assigned_irq assigned_irq;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-			goto out;
-		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_DEASSIGN_DEV_IRQ: {
-		struct kvm_assigned_irq assigned_irq;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-			goto out;
-		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
-		if (r)
-			goto out;
-		break;
-	}
-#endif
-#endif
-#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
-	case KVM_DEASSIGN_PCI_DEVICE: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
-		if (r)
-			goto out;
-		break;
-	}
-#endif
-#ifdef KVM_CAP_IRQ_ROUTING
-	case KVM_SET_GSI_ROUTING: {
-		struct kvm_irq_routing routing;
-		struct kvm_irq_routing __user *urouting;
-		struct kvm_irq_routing_entry *entries;
-
-		r = -EFAULT;
-		if (copy_from_user(&routing, argp, sizeof(routing)))
-			goto out;
-		r = -EINVAL;
-		if (routing.nr >= KVM_MAX_IRQ_ROUTES)
-			goto out;
-		if (routing.flags)
-			goto out;
-		r = -ENOMEM;
-		entries = vmalloc(routing.nr * sizeof(*entries));
-		if (!entries)
-			goto out;
-		r = -EFAULT;
-		urouting = argp;
-		if (copy_from_user(entries, urouting->entries,
-				   routing.nr * sizeof(*entries)))
-			goto out_free_irq_routing;
-		r = kvm_set_irq_routing(kvm, entries, routing.nr,
-					routing.flags);
-	out_free_irq_routing:
-		vfree(entries);
-		break;
-	}
-#endif /* KVM_CAP_IRQ_ROUTING */
-#ifdef __KVM_HAVE_MSIX
-	case KVM_ASSIGN_SET_MSIX_NR: {
-		struct kvm_assigned_msix_nr entry_nr;
-		r = -EFAULT;
-		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
-			goto out;
-		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_SET_MSIX_ENTRY: {
-		struct kvm_assigned_msix_entry entry;
-		r = -EFAULT;
-		if (copy_from_user(&entry, argp, sizeof entry))
-			goto out;
-		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
-		if (r)
-			goto out;
-		break;
-	}
 #endif
 	case KVM_IRQFD: {
 		struct kvm_irqfd data;
@@ -2301,6 +1505,8 @@ static long kvm_vm_ioctl(struct file *filp,
 #endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+		if (r == -ENOTTY)
+			r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
 	}
 out:
 	return r;
-- 
cgit v1.2.3


From 10474ae8945ce08622fd1f3464e55bd817bf2376 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 15 Sep 2009 11:37:46 +0200
Subject: KVM: Activate Virtualization On Demand

X86 CPUs need to have some magic happening to enable the virtualization
extensions on them. This magic can result in unpleasant results for
users, like blocking other VMMs from working (vmx) or using invalid TLB
entries (svm).

Currently KVM activates virtualization when the respective kernel module
is loaded. This blocks us from autoloading KVM modules without breaking
other VMMs.

To circumvent this problem at least a bit, this patch introduces on
demand activation of virtualization. This means, that instead
virtualization is enabled on creation of the first virtual machine
and disabled on destruction of the last one.

So using this, KVM can be easily autoloaded, while keeping other
hypervisors usable.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c        |  8 ++--
 arch/powerpc/kvm/powerpc.c      |  3 +-
 arch/s390/kvm/kvm-s390.c        |  3 +-
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm.c              | 13 ++++--
 arch/x86/kvm/vmx.c              | 11 +++--
 arch/x86/kvm/x86.c              |  4 +-
 include/linux/kvm_host.h        |  2 +-
 virt/kvm/kvm_main.c             | 90 +++++++++++++++++++++++++++++++++++------
 9 files changed, 108 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index f6471c882667..5fdeec5fddcf 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -124,7 +124,7 @@ long ia64_pal_vp_create(u64 *vpd, u64 *host_iva, u64 *opt_handler)
 
 static  DEFINE_SPINLOCK(vp_lock);
 
-void kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void *garbage)
 {
 	long  status;
 	long  tmp_base;
@@ -137,7 +137,7 @@ void kvm_arch_hardware_enable(void *garbage)
 	slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT);
 	local_irq_restore(saved_psr);
 	if (slot < 0)
-		return;
+		return -EINVAL;
 
 	spin_lock(&vp_lock);
 	status = ia64_pal_vp_init_env(kvm_vsa_base ?
@@ -145,7 +145,7 @@ void kvm_arch_hardware_enable(void *garbage)
 			__pa(kvm_vm_buffer), KVM_VM_BUFFER_BASE, &tmp_base);
 	if (status != 0) {
 		printk(KERN_WARNING"kvm: Failed to Enable VT Support!!!!\n");
-		return ;
+		return -EINVAL;
 	}
 
 	if (!kvm_vsa_base) {
@@ -154,6 +154,8 @@ void kvm_arch_hardware_enable(void *garbage)
 	}
 	spin_unlock(&vp_lock);
 	ia64_ptr_entry(0x3, slot);
+
+	return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 95af62217b6b..5902bbc2411e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -78,8 +78,9 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return r;
 }
 
-void kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void *garbage)
 {
+	return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 00e2ce8e91f5..544505893c9f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -74,9 +74,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 static unsigned long long *facilities;
 
 /* Section: not file related */
-void kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void *garbage)
 {
 	/* every s390 is virtualization enabled ;-) */
+	return 0;
 }
 
 void kvm_arch_hardware_disable(void *garbage)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a46e2dd9aca8..295c7c4d9c90 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -459,7 +459,7 @@ struct descriptor_table {
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
-	void (*hardware_enable)(void *dummy);      /* __init */
+	int (*hardware_enable)(void *dummy);
 	void (*hardware_disable)(void *dummy);
 	void (*check_processor_compatibility)(void *rtn);
 	int (*hardware_setup)(void);               /* __init */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f54c4f9d2865..59fe4d54da11 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -316,7 +316,7 @@ static void svm_hardware_disable(void *garbage)
 	cpu_svm_disable();
 }
 
-static void svm_hardware_enable(void *garbage)
+static int svm_hardware_enable(void *garbage)
 {
 
 	struct svm_cpu_data *svm_data;
@@ -325,16 +325,20 @@ static void svm_hardware_enable(void *garbage)
 	struct desc_struct *gdt;
 	int me = raw_smp_processor_id();
 
+	rdmsrl(MSR_EFER, efer);
+	if (efer & EFER_SVME)
+		return -EBUSY;
+
 	if (!has_svm()) {
 		printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
-		return;
+		return -EINVAL;
 	}
 	svm_data = per_cpu(svm_data, me);
 
 	if (!svm_data) {
 		printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
 		       me);
-		return;
+		return -EINVAL;
 	}
 
 	svm_data->asid_generation = 1;
@@ -345,11 +349,12 @@ static void svm_hardware_enable(void *garbage)
 	gdt = (struct desc_struct *)gdt_descr.base;
 	svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
-	rdmsrl(MSR_EFER, efer);
 	wrmsrl(MSR_EFER, efer | EFER_SVME);
 
 	wrmsrl(MSR_VM_HSAVE_PA,
 	       page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
+
+	return 0;
 }
 
 static void svm_cpu_uninit(int cpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 73cb5dd960cf..a187570e4837 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1138,12 +1138,15 @@ static __init int vmx_disabled_by_bios(void)
 	/* locked but not enabled */
 }
 
-static void hardware_enable(void *garbage)
+static int hardware_enable(void *garbage)
 {
 	int cpu = raw_smp_processor_id();
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 	u64 old;
 
+	if (read_cr4() & X86_CR4_VMXE)
+		return -EBUSY;
+
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 	if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1158,6 +1161,10 @@ static void hardware_enable(void *garbage)
 	asm volatile (ASM_VMX_VMXON_RAX
 		      : : "a"(&phys_addr), "m"(phys_addr)
 		      : "memory", "cc");
+
+	ept_sync_global();
+
+	return 0;
 }
 
 static void vmclear_local_vcpus(void)
@@ -4040,8 +4047,6 @@ static int __init vmx_init(void)
 	if (bypass_guest_pf)
 		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
 
-	ept_sync_global();
-
 	return 0;
 
 out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 829e3063e2ab..3d83de8bcbf4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4691,9 +4691,9 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
-void kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void *garbage)
 {
-	kvm_x86_ops->hardware_enable(garbage);
+	return kvm_x86_ops->hardware_enable(garbage);
 }
 
 void kvm_arch_hardware_disable(void *garbage)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c0a1cc35f080..b985a29d8175 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -345,7 +345,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
-void kvm_arch_hardware_enable(void *garbage);
+int kvm_arch_hardware_enable(void *garbage);
 void kvm_arch_hardware_disable(void *garbage);
 int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 38e4d2c34ac1..70c8cbea0a99 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -69,6 +69,8 @@ DEFINE_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);
 
 static cpumask_var_t cpus_hardware_enabled;
+static int kvm_usage_count = 0;
+static atomic_t hardware_enable_failed;
 
 struct kmem_cache *kvm_vcpu_cache;
 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
@@ -79,6 +81,8 @@ struct dentry *kvm_debugfs_dir;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 			   unsigned long arg);
+static int hardware_enable_all(void);
+static void hardware_disable_all(void);
 
 static bool kvm_rebooting;
 
@@ -339,6 +343,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 
 static struct kvm *kvm_create_vm(void)
 {
+	int r = 0;
 	struct kvm *kvm = kvm_arch_create_vm();
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 	struct page *page;
@@ -346,6 +351,11 @@ static struct kvm *kvm_create_vm(void)
 
 	if (IS_ERR(kvm))
 		goto out;
+
+	r = hardware_enable_all();
+	if (r)
+		goto out_err_nodisable;
+
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
@@ -354,8 +364,8 @@ static struct kvm *kvm_create_vm(void)
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page) {
-		kfree(kvm);
-		return ERR_PTR(-ENOMEM);
+		r = -ENOMEM;
+		goto out_err;
 	}
 	kvm->coalesced_mmio_ring =
 			(struct kvm_coalesced_mmio_ring *)page_address(page);
@@ -363,15 +373,13 @@ static struct kvm *kvm_create_vm(void)
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	{
-		int err;
 		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
-		err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
-		if (err) {
+		r = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+		if (r) {
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 			put_page(page);
 #endif
-			kfree(kvm);
-			return ERR_PTR(err);
+			goto out_err;
 		}
 	}
 #endif
@@ -395,6 +403,12 @@ static struct kvm *kvm_create_vm(void)
 #endif
 out:
 	return kvm;
+
+out_err:
+	hardware_disable_all();
+out_err_nodisable:
+	kfree(kvm);
+	return ERR_PTR(r);
 }
 
 /*
@@ -453,6 +467,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_arch_flush_shadow(kvm);
 #endif
 	kvm_arch_destroy_vm(kvm);
+	hardware_disable_all();
 	mmdrop(mm);
 }
 
@@ -1644,11 +1659,21 @@ static struct miscdevice kvm_dev = {
 static void hardware_enable(void *junk)
 {
 	int cpu = raw_smp_processor_id();
+	int r;
 
 	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
 		return;
+
 	cpumask_set_cpu(cpu, cpus_hardware_enabled);
-	kvm_arch_hardware_enable(NULL);
+
+	r = kvm_arch_hardware_enable(NULL);
+
+	if (r) {
+		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
+		atomic_inc(&hardware_enable_failed);
+		printk(KERN_INFO "kvm: enabling virtualization on "
+				 "CPU%d failed\n", cpu);
+	}
 }
 
 static void hardware_disable(void *junk)
@@ -1661,11 +1686,52 @@ static void hardware_disable(void *junk)
 	kvm_arch_hardware_disable(NULL);
 }
 
+static void hardware_disable_all_nolock(void)
+{
+	BUG_ON(!kvm_usage_count);
+
+	kvm_usage_count--;
+	if (!kvm_usage_count)
+		on_each_cpu(hardware_disable, NULL, 1);
+}
+
+static void hardware_disable_all(void)
+{
+	spin_lock(&kvm_lock);
+	hardware_disable_all_nolock();
+	spin_unlock(&kvm_lock);
+}
+
+static int hardware_enable_all(void)
+{
+	int r = 0;
+
+	spin_lock(&kvm_lock);
+
+	kvm_usage_count++;
+	if (kvm_usage_count == 1) {
+		atomic_set(&hardware_enable_failed, 0);
+		on_each_cpu(hardware_enable, NULL, 1);
+
+		if (atomic_read(&hardware_enable_failed)) {
+			hardware_disable_all_nolock();
+			r = -EBUSY;
+		}
+	}
+
+	spin_unlock(&kvm_lock);
+
+	return r;
+}
+
 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 			   void *v)
 {
 	int cpu = (long)v;
 
+	if (!kvm_usage_count)
+		return NOTIFY_OK;
+
 	val &= ~CPU_TASKS_FROZEN;
 	switch (val) {
 	case CPU_DYING:
@@ -1868,13 +1934,15 @@ static void kvm_exit_debug(void)
 
 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
 {
-	hardware_disable(NULL);
+	if (kvm_usage_count)
+		hardware_disable(NULL);
 	return 0;
 }
 
 static int kvm_resume(struct sys_device *dev)
 {
-	hardware_enable(NULL);
+	if (kvm_usage_count)
+		hardware_enable(NULL);
 	return 0;
 }
 
@@ -1949,7 +2017,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 			goto out_free_1;
 	}
 
-	on_each_cpu(hardware_enable, NULL, 1);
 	r = register_cpu_notifier(&kvm_cpu_notifier);
 	if (r)
 		goto out_free_2;
@@ -1999,7 +2066,6 @@ out_free_3:
 	unregister_reboot_notifier(&kvm_reboot_notifier);
 	unregister_cpu_notifier(&kvm_cpu_notifier);
 out_free_2:
-	on_each_cpu(hardware_disable, NULL, 1);
 out_free_1:
 	kvm_arch_hardware_unsetup();
 out_free_0a:
-- 
cgit v1.2.3


From d255f4f2bac81eb798fcf76938147f1f6c756ae2 Mon Sep 17 00:00:00 2001
From: "Zhai, Edwin" <edwin.zhai@intel.com>
Date: Fri, 9 Oct 2009 18:03:20 +0800
Subject: KVM: introduce kvm_vcpu_on_spin

Introduce kvm_vcpu_on_spin, to be used by VMX/SVM to yield processing
once the cpu detects pause-based looping.

Signed-off-by: "Zhai, Edwin" <edwin.zhai@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b985a29d8175..bd5a616d9373 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -286,6 +286,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
 void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 70c8cbea0a99..cac69c4415df 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1108,6 +1108,21 @@ void kvm_resched(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_resched);
 
+void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
+{
+	ktime_t expires;
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+
+	/* Sleep for 100 us, and hope lock-holder got scheduled */
+	expires = ktime_add_ns(ktime_get(), 100000UL);
+	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+
+	finish_wait(&vcpu->wq, &wait);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
+
 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
-- 
cgit v1.2.3


From ffde22ac53b6d6b1d7206f1172176a667eead778 Mon Sep 17 00:00:00 2001
From: Ed Swierk <eswierk@aristanetworks.com>
Date: Thu, 15 Oct 2009 15:21:43 -0700
Subject: KVM: Xen PV-on-HVM guest support

Support for Xen PV-on-HVM guests can be implemented almost entirely in
userspace, except for handling one annoying MSR that maps a Xen
hypercall blob into guest address space.

A generic mechanism to delegate MSR writes to userspace seems overkill
and risks encouraging similar MSR abuse in the future.  Thus this patch
adds special support for the Xen HVM MSR.

I implemented a new ioctl, KVM_XEN_HVM_CONFIG, that lets userspace tell
KVM which MSR the guest will write to, as well as the starting address
and size of the hypercall blobs (one each for 32-bit and 64-bit) that
userspace has loaded from files.  When the guest writes to the MSR, KVM
copies one page of the blob from userspace to the guest.

I've tested this patch with a hacked-up version of Gerd's userspace
code, booting a number of guests (CentOS 5.3 i386 and x86_64, and
FreeBSD 8.0-RC1 amd64) and exercising PV network and block devices.

[jan: fix i386 build warning]
[avi: future proof abi with a flags field]

Signed-off-by: Ed Swierk <eswierk@aristanetworks.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/api.txt       | 24 +++++++++++++++++++++
 arch/x86/include/asm/kvm.h      |  1 +
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/x86.c              | 46 +++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm.h             | 16 ++++++++++++++
 5 files changed, 89 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 5a4bc8cf6d04..3e8684e48506 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -593,6 +593,30 @@ struct kvm_irqchip {
 	} chip;
 };
 
+4.27 KVM_XEN_HVM_CONFIG
+
+Capability: KVM_CAP_XEN_HVM
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_xen_hvm_config (in)
+Returns: 0 on success, -1 on error
+
+Sets the MSR that the Xen HVM guest uses to initialize its hypercall
+page, and provides the starting address and size of the hypercall
+blobs in userspace.  When the guest writes the MSR, kvm copies one
+page of a blob (32- or 64-bit, depending on the vcpu mode) to guest
+memory.
+
+struct kvm_xen_hvm_config {
+	__u32 flags;
+	__u32 msr;
+	__u64 blob_addr_32;
+	__u64 blob_addr_64;
+	__u8 blob_size_32;
+	__u8 blob_size_64;
+	__u8 pad2[30];
+};
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f02e87a5206f..ef9b4b73cce4 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -19,6 +19,7 @@
 #define __KVM_HAVE_MSIX
 #define __KVM_HAVE_MCE
 #define __KVM_HAVE_PIT_STATE2
+#define __KVM_HAVE_XEN_HVM
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 179a919f53a4..36f3b53f5c27 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -410,6 +410,8 @@ struct kvm_arch{
 
 	unsigned long irq_sources_bitmap;
 	u64 vm_init_tsc;
+
+	struct kvm_xen_hvm_config xen_hvm_config;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5d450cc6f841..bb842db3ee7c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -857,6 +857,38 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	return 0;
 }
 
+static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int lm = is_long_mode(vcpu);
+	u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
+		: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
+	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+		: kvm->arch.xen_hvm_config.blob_size_32;
+	u32 page_num = data & ~PAGE_MASK;
+	u64 page_addr = data & PAGE_MASK;
+	u8 *page;
+	int r;
+
+	r = -E2BIG;
+	if (page_num >= blob_size)
+		goto out;
+	r = -ENOMEM;
+	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!page)
+		goto out;
+	r = -EFAULT;
+	if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
+		goto out_free;
+	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
+		goto out_free;
+	r = 0;
+out_free:
+	kfree(page);
+out:
+	return r;
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
@@ -972,6 +1004,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			"0x%x data 0x%llx\n", msr, data);
 		break;
 	default:
+		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
+			return xen_hvm_config(vcpu, data);
 		if (!ignore_msrs) {
 			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
 				msr, data);
@@ -1246,6 +1280,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PIT2:
 	case KVM_CAP_PIT_STATE2:
 	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+	case KVM_CAP_XEN_HVM:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2441,6 +2476,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_XEN_HVM_CONFIG: {
+		r = -EFAULT;
+		if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
+				   sizeof(struct kvm_xen_hvm_config)))
+			goto out;
+		r = -EINVAL;
+		if (kvm->arch.xen_hvm_config.flags)
+			goto out;
+		r = 0;
+		break;
+	}
 	default:
 		;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index f8f8900fc5ec..b694c1d2f918 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -436,6 +436,9 @@ struct kvm_ioeventfd {
 #endif
 #define KVM_CAP_IOEVENTFD 36
 #define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
+#ifdef __KVM_HAVE_XEN_HVM
+#define KVM_CAP_XEN_HVM 38
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -488,6 +491,18 @@ struct kvm_x86_mce {
 };
 #endif
 
+#ifdef KVM_CAP_XEN_HVM
+struct kvm_xen_hvm_config {
+	__u32 flags;
+	__u32 msr;
+	__u64 blob_addr_32;
+	__u64 blob_addr_64;
+	__u8 blob_size_32;
+	__u8 blob_size_64;
+	__u8 pad2[30];
+};
+#endif
+
 #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
 
 struct kvm_irqfd {
@@ -546,6 +561,7 @@ struct kvm_irqfd {
 #define KVM_CREATE_PIT2		   _IOW(KVMIO, 0x77, struct kvm_pit_config)
 #define KVM_SET_BOOT_CPU_ID        _IO(KVMIO, 0x78)
 #define KVM_IOEVENTFD             _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
+#define KVM_XEN_HVM_CONFIG        _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config)
 
 /*
  * ioctls for vcpu fds
-- 
cgit v1.2.3


From afbcf7ab8d1bc8c2d04792f6d9e786e0adeb328d Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Fri, 16 Oct 2009 15:28:36 -0400
Subject: KVM: allow userspace to adjust kvmclock offset

When we migrate a kvm guest that uses pvclock between two hosts, we may
suffer a large skew. This is because there can be significant differences
between the monotonic clock of the hosts involved. When a new host with
a much larger monotonic time starts running the guest, the view of time
will be significantly impacted.

Situation is much worse when we do the opposite, and migrate to a host with
a smaller monotonic clock.

This proposed ioctl will allow userspace to inform us what is the monotonic
clock value in the source host, so we can keep the time skew short, and
more importantly, never goes backwards. Userspace may also need to trigger
the current data, since from the first migration onwards, it won't be
reflected by a simple call to clock_gettime() anymore.

[marcelo: future-proof abi with a flags field]
[jan: fix KVM_GET_CLOCK by clearing flags field instead of checking it]

Signed-off-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/api.txt       | 36 +++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 42 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/kvm.h             | 10 ++++++++++
 4 files changed, 88 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 3e8684e48506..36594ba57723 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -617,6 +617,42 @@ struct kvm_xen_hvm_config {
 	__u8 pad2[30];
 };
 
+4.27 KVM_GET_CLOCK
+
+Capability: KVM_CAP_ADJUST_CLOCK
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_clock_data (out)
+Returns: 0 on success, -1 on error
+
+Gets the current timestamp of kvmclock as seen by the current guest. In
+conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios
+such as migration.
+
+struct kvm_clock_data {
+	__u64 clock;  /* kvmclock current value */
+	__u32 flags;
+	__u32 pad[9];
+};
+
+4.28 KVM_SET_CLOCK
+
+Capability: KVM_CAP_ADJUST_CLOCK
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_clock_data (in)
+Returns: 0 on success, -1 on error
+
+Sets the current timestamp of kvmclock to the valued specific in its parameter.
+In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios
+such as migration.
+
+struct kvm_clock_data {
+	__u64 clock;  /* kvmclock current value */
+	__u32 flags;
+	__u32 pad[9];
+};
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4d994ad5051a..0558ff8c32ae 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -413,6 +413,7 @@ struct kvm_arch{
 
 	unsigned long irq_sources_bitmap;
 	u64 vm_init_tsc;
+	s64 kvmclock_offset;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 13f30aac460b..e16cdc9ec0c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -680,7 +680,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	/* With all the info we got, fill in the values */
 
 	vcpu->hv_clock.system_time = ts.tv_nsec +
-				     (NSEC_PER_SEC * (u64)ts.tv_sec);
+				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+
 	/*
 	 * The interface expects us to write an even number signaling that the
 	 * update is finished. Since the guest won't see the intermediate
@@ -1262,6 +1263,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PIT_STATE2:
 	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
 	case KVM_CAP_XEN_HVM:
+	case KVM_CAP_ADJUST_CLOCK:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -2468,6 +2470,44 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_SET_CLOCK: {
+		struct timespec now;
+		struct kvm_clock_data user_ns;
+		u64 now_ns;
+		s64 delta;
+
+		r = -EFAULT;
+		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
+			goto out;
+
+		r = -EINVAL;
+		if (user_ns.flags)
+			goto out;
+
+		r = 0;
+		ktime_get_ts(&now);
+		now_ns = timespec_to_ns(&now);
+		delta = user_ns.clock - now_ns;
+		kvm->arch.kvmclock_offset = delta;
+		break;
+	}
+	case KVM_GET_CLOCK: {
+		struct timespec now;
+		struct kvm_clock_data user_ns;
+		u64 now_ns;
+
+		ktime_get_ts(&now);
+		now_ns = timespec_to_ns(&now);
+		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
+		user_ns.flags = 0;
+
+		r = -EFAULT;
+		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
+			goto out;
+		r = 0;
+		break;
+	}
+
 	default:
 		;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index b694c1d2f918..6ed1a12ed526 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -439,6 +439,7 @@ struct kvm_ioeventfd {
 #ifdef __KVM_HAVE_XEN_HVM
 #define KVM_CAP_XEN_HVM 38
 #endif
+#define KVM_CAP_ADJUST_CLOCK 39
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -512,6 +513,12 @@ struct kvm_irqfd {
 	__u8  pad[20];
 };
 
+struct kvm_clock_data {
+	__u64 clock;
+	__u32 flags;
+	__u32 pad[9];
+};
+
 /*
  * ioctls for VM fds
  */
@@ -562,6 +569,9 @@ struct kvm_irqfd {
 #define KVM_SET_BOOT_CPU_ID        _IO(KVMIO, 0x78)
 #define KVM_IOEVENTFD             _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
 #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config)
+#define KVM_SET_CLOCK             _IOW(KVMIO, 0x7b, struct kvm_clock_data)
+#define KVM_GET_CLOCK             _IOR(KVMIO, 0x7c, struct kvm_clock_data)
+
 
 /*
  * ioctls for vcpu fds
-- 
cgit v1.2.3


From c54d2aba27f0c505d61700d656c5943e96982e60 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 2 Nov 2009 17:20:28 +0100
Subject: KVM: Reorder IOCTLs in main kvm.h

Obviously, people tend to extend this header at the bottom - more or
less blindly. Ensure that deprecated stuff gets its own corner again by
moving things to the top. Also add some comments and reindent IOCTLs to
make them more readable and reduce the risk of number collisions.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h | 235 ++++++++++++++++++++++++++--------------------------
 1 file changed, 117 insertions(+), 118 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6ed1a12ed526..ca62b8e056f9 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -14,12 +14,76 @@
 
 #define KVM_API_VERSION 12
 
-/* for KVM_TRACE_ENABLE, deprecated */
+/* *** Deprecated interfaces *** */
+
+#define KVM_TRC_SHIFT           16
+
+#define KVM_TRC_ENTRYEXIT       (1 << KVM_TRC_SHIFT)
+#define KVM_TRC_HANDLER         (1 << (KVM_TRC_SHIFT + 1))
+
+#define KVM_TRC_VMENTRY         (KVM_TRC_ENTRYEXIT + 0x01)
+#define KVM_TRC_VMEXIT          (KVM_TRC_ENTRYEXIT + 0x02)
+#define KVM_TRC_PAGE_FAULT      (KVM_TRC_HANDLER + 0x01)
+
+#define KVM_TRC_HEAD_SIZE       12
+#define KVM_TRC_CYCLE_SIZE      8
+#define KVM_TRC_EXTRA_MAX       7
+
+#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
+#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
+#define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
+#define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
+#define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
+#define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
+
 struct kvm_user_trace_setup {
-	__u32 buf_size; /* sub_buffer size of each per-cpu */
-	__u32 buf_nr; /* the number of sub_buffers of each per-cpu */
+	__u32 buf_size;
+	__u32 buf_nr;
+};
+
+#define __KVM_DEPRECATED_MAIN_W_0x06 \
+	_IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
+#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07)
+#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08)
+
+#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq)
+
+struct kvm_breakpoint {
+	__u32 enabled;
+	__u32 padding;
+	__u64 address;
+};
+
+struct kvm_debug_guest {
+	__u32 enabled;
+	__u32 pad;
+	struct kvm_breakpoint breakpoints[4];
+	__u32 singlestep;
 };
 
+#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest)
+
+/* *** End of deprecated interfaces *** */
+
+
 /* for KVM_CREATE_MEMORY_REGION */
 struct kvm_memory_region {
 	__u32 slot;
@@ -329,24 +393,6 @@ struct kvm_ioeventfd {
 	__u8  pad[36];
 };
 
-#define KVM_TRC_SHIFT           16
-/*
- * kvm trace categories
- */
-#define KVM_TRC_ENTRYEXIT       (1 << KVM_TRC_SHIFT)
-#define KVM_TRC_HANDLER         (1 << (KVM_TRC_SHIFT + 1)) /* only 12 bits */
-
-/*
- * kvm trace action
- */
-#define KVM_TRC_VMENTRY         (KVM_TRC_ENTRYEXIT + 0x01)
-#define KVM_TRC_VMEXIT          (KVM_TRC_ENTRYEXIT + 0x02)
-#define KVM_TRC_PAGE_FAULT      (KVM_TRC_HANDLER + 0x01)
-
-#define KVM_TRC_HEAD_SIZE       12
-#define KVM_TRC_CYCLE_SIZE      8
-#define KVM_TRC_EXTRA_MAX       7
-
 #define KVMIO 0xAE
 
 /*
@@ -367,12 +413,10 @@ struct kvm_ioeventfd {
  */
 #define KVM_GET_VCPU_MMAP_SIZE    _IO(KVMIO,   0x04) /* in bytes */
 #define KVM_GET_SUPPORTED_CPUID   _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
-/*
- * ioctls for kvm trace
- */
-#define KVM_TRACE_ENABLE          _IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
-#define KVM_TRACE_PAUSE           _IO(KVMIO,  0x07)
-#define KVM_TRACE_DISABLE         _IO(KVMIO,  0x08)
+#define KVM_TRACE_ENABLE          __KVM_DEPRECATED_MAIN_W_0x06
+#define KVM_TRACE_PAUSE           __KVM_DEPRECATED_MAIN_0x07
+#define KVM_TRACE_DISABLE         __KVM_DEPRECATED_MAIN_0x08
+
 /*
  * Extension capability list.
  */
@@ -522,56 +566,57 @@ struct kvm_clock_data {
 /*
  * ioctls for VM fds
  */
-#define KVM_SET_MEMORY_REGION     _IOW(KVMIO, 0x40, struct kvm_memory_region)
+#define KVM_SET_MEMORY_REGION     _IOW(KVMIO,  0x40, struct kvm_memory_region)
 /*
  * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
  * a vcpu fd.
  */
-#define KVM_CREATE_VCPU           _IO(KVMIO,  0x41)
-#define KVM_GET_DIRTY_LOG         _IOW(KVMIO, 0x42, struct kvm_dirty_log)
-#define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO, 0x43, struct kvm_memory_alias)
-#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO, 0x44)
-#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO, 0x45)
-#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
+#define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
+#define KVM_GET_DIRTY_LOG         _IOW(KVMIO,  0x42, struct kvm_dirty_log)
+#define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO,  0x43, struct kvm_memory_alias)
+#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO,   0x44)
+#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
 					struct kvm_userspace_memory_region)
-#define KVM_SET_TSS_ADDR          _IO(KVMIO, 0x47)
-#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
+#define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
+#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
 /* Device model IOC */
-#define KVM_CREATE_IRQCHIP	  _IO(KVMIO,  0x60)
-#define KVM_IRQ_LINE		  _IOW(KVMIO, 0x61, struct kvm_irq_level)
-#define KVM_GET_IRQCHIP		  _IOWR(KVMIO, 0x62, struct kvm_irqchip)
-#define KVM_SET_IRQCHIP		  _IOR(KVMIO,  0x63, struct kvm_irqchip)
-#define KVM_CREATE_PIT		  _IO(KVMIO,  0x64)
-#define KVM_GET_PIT		  _IOWR(KVMIO, 0x65, struct kvm_pit_state)
-#define KVM_SET_PIT		  _IOR(KVMIO,  0x66, struct kvm_pit_state)
-#define KVM_IRQ_LINE_STATUS	  _IOWR(KVMIO, 0x67, struct kvm_irq_level)
+#define KVM_CREATE_IRQCHIP        _IO(KVMIO,   0x60)
+#define KVM_IRQ_LINE              _IOW(KVMIO,  0x61, struct kvm_irq_level)
+#define KVM_GET_IRQCHIP           _IOWR(KVMIO, 0x62, struct kvm_irqchip)
+#define KVM_SET_IRQCHIP           _IOR(KVMIO,  0x63, struct kvm_irqchip)
+#define KVM_CREATE_PIT            _IO(KVMIO,   0x64)
+#define KVM_GET_PIT               _IOWR(KVMIO, 0x65, struct kvm_pit_state)
+#define KVM_SET_PIT               _IOR(KVMIO,  0x66, struct kvm_pit_state)
+#define KVM_IRQ_LINE_STATUS       _IOWR(KVMIO, 0x67, struct kvm_irq_level)
 #define KVM_REGISTER_COALESCED_MMIO \
 			_IOW(KVMIO,  0x67, struct kvm_coalesced_mmio_zone)
 #define KVM_UNREGISTER_COALESCED_MMIO \
 			_IOW(KVMIO,  0x68, struct kvm_coalesced_mmio_zone)
-#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
-				   struct kvm_assigned_pci_dev)
-#define KVM_SET_GSI_ROUTING       _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
+#define KVM_ASSIGN_PCI_DEVICE     _IOR(KVMIO,  0x69, \
+				       struct kvm_assigned_pci_dev)
+#define KVM_SET_GSI_ROUTING       _IOW(KVMIO,  0x6a, struct kvm_irq_routing)
 /* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
-#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
-			    struct kvm_assigned_irq)
-#define KVM_ASSIGN_DEV_IRQ        _IOW(KVMIO, 0x70, struct kvm_assigned_irq)
-#define KVM_REINJECT_CONTROL      _IO(KVMIO, 0x71)
-#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
-				     struct kvm_assigned_pci_dev)
-#define KVM_ASSIGN_SET_MSIX_NR \
-			_IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr)
-#define KVM_ASSIGN_SET_MSIX_ENTRY \
-			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
-#define KVM_DEASSIGN_DEV_IRQ       _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
-#define KVM_IRQFD                  _IOW(KVMIO, 0x76, struct kvm_irqfd)
-#define KVM_CREATE_PIT2		   _IOW(KVMIO, 0x77, struct kvm_pit_config)
-#define KVM_SET_BOOT_CPU_ID        _IO(KVMIO, 0x78)
-#define KVM_IOEVENTFD             _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
-#define KVM_XEN_HVM_CONFIG        _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config)
-#define KVM_SET_CLOCK             _IOW(KVMIO, 0x7b, struct kvm_clock_data)
-#define KVM_GET_CLOCK             _IOR(KVMIO, 0x7c, struct kvm_clock_data)
-
+#define KVM_ASSIGN_IRQ            __KVM_DEPRECATED_VM_R_0x70
+#define KVM_ASSIGN_DEV_IRQ        _IOW(KVMIO,  0x70, struct kvm_assigned_irq)
+#define KVM_REINJECT_CONTROL      _IO(KVMIO,   0x71)
+#define KVM_DEASSIGN_PCI_DEVICE   _IOW(KVMIO,  0x72, \
+				       struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_SET_MSIX_NR    _IOW(KVMIO,  0x73, \
+				       struct kvm_assigned_msix_nr)
+#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO,  0x74, \
+				       struct kvm_assigned_msix_entry)
+#define KVM_DEASSIGN_DEV_IRQ      _IOW(KVMIO,  0x75, struct kvm_assigned_irq)
+#define KVM_IRQFD                 _IOW(KVMIO,  0x76, struct kvm_irqfd)
+#define KVM_CREATE_PIT2		  _IOW(KVMIO,  0x77, struct kvm_pit_config)
+#define KVM_SET_BOOT_CPU_ID       _IO(KVMIO,   0x78)
+#define KVM_IOEVENTFD             _IOW(KVMIO,  0x79, struct kvm_ioeventfd)
+#define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
+#define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
+#define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_PIT_STATE2 */
+#define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
+#define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
 
 /*
  * ioctls for vcpu fds
@@ -584,7 +629,7 @@ struct kvm_clock_data {
 #define KVM_TRANSLATE             _IOWR(KVMIO, 0x85, struct kvm_translation)
 #define KVM_INTERRUPT             _IOW(KVMIO,  0x86, struct kvm_interrupt)
 /* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */
-#define KVM_DEBUG_GUEST           __KVM_DEPRECATED_DEBUG_GUEST
+#define KVM_DEBUG_GUEST           __KVM_DEPRECATED_VCPU_W_0x87
 #define KVM_GET_MSRS              _IOWR(KVMIO, 0x88, struct kvm_msrs)
 #define KVM_SET_MSRS              _IOW(KVMIO,  0x89, struct kvm_msrs)
 #define KVM_SET_CPUID             _IOW(KVMIO,  0x8a, struct kvm_cpuid)
@@ -596,7 +641,7 @@ struct kvm_clock_data {
 #define KVM_SET_CPUID2            _IOW(KVMIO,  0x90, struct kvm_cpuid2)
 #define KVM_GET_CPUID2            _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
 /* Available with KVM_CAP_VAPIC */
-#define KVM_TPR_ACCESS_REPORTING  _IOWR(KVMIO,  0x92, struct kvm_tpr_access_ctl)
+#define KVM_TPR_ACCESS_REPORTING  _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
 /* Available with KVM_CAP_VAPIC */
 #define KVM_SET_VAPIC_ADDR        _IOW(KVMIO,  0x93, struct kvm_vapic_addr)
 /* valid for virtual machine (for floating interrupt)_and_ vcpu */
@@ -608,67 +653,21 @@ struct kvm_clock_data {
 /* initial ipl psw for s390 */
 #define KVM_S390_SET_INITIAL_PSW  _IOW(KVMIO,  0x96, struct kvm_s390_psw)
 /* initial reset for s390 */
-#define KVM_S390_INITIAL_RESET    _IO(KVMIO,  0x97)
+#define KVM_S390_INITIAL_RESET    _IO(KVMIO,   0x97)
 #define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
 #define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
 /* Available with KVM_CAP_NMI */
-#define KVM_NMI                   _IO(KVMIO,  0x9a)
+#define KVM_NMI                   _IO(KVMIO,   0x9a)
 /* Available with KVM_CAP_SET_GUEST_DEBUG */
 #define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
 /* MCE for x86 */
 #define KVM_X86_SETUP_MCE         _IOW(KVMIO,  0x9c, __u64)
 #define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO,  0x9d, __u64)
 #define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9e, struct kvm_x86_mce)
-
-/*
- * Deprecated interfaces
- */
-struct kvm_breakpoint {
-	__u32 enabled;
-	__u32 padding;
-	__u64 address;
-};
-
-struct kvm_debug_guest {
-	__u32 enabled;
-	__u32 pad;
-	struct kvm_breakpoint breakpoints[4];
-	__u32 singlestep;
-};
-
-#define __KVM_DEPRECATED_DEBUG_GUEST _IOW(KVMIO,  0x87, struct kvm_debug_guest)
-
+/* IA64 stack access */
 #define KVM_IA64_VCPU_GET_STACK   _IOR(KVMIO,  0x9a, void *)
 #define KVM_IA64_VCPU_SET_STACK   _IOW(KVMIO,  0x9b, void *)
 
-#define KVM_GET_PIT2   _IOR(KVMIO,   0x9f, struct kvm_pit_state2)
-#define KVM_SET_PIT2   _IOW(KVMIO,   0xa0, struct kvm_pit_state2)
-
-#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
-#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
-#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
-#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
-#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
-#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
-#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
-#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
-#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
-#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
-#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
-#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
-#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
-#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
-#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
-#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
-#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
-#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
-#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
-#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
-#define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
-#define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
-#define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
-#define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
-
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
 struct kvm_assigned_pci_dev {
@@ -722,4 +721,4 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
-#endif
+#endif /* __LINUX_KVM_H */
-- 
cgit v1.2.3


From a9c7399d6cda0a092b347f8ee49bbe44f6e1fe66 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 4 Nov 2009 11:54:59 +0200
Subject: KVM: Allow internal errors reported to userspace to carry extra data

Usually userspace will freeze the guest so we can inspect it, but some
internal state is not available.  Add extra data to internal error
reporting so we can expose it to the debugger.  Extra data is specific
to the suberror.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c  | 1 +
 arch/x86/kvm/vmx.c  | 1 +
 include/linux/kvm.h | 4 ++++
 virt/kvm/kvm_main.c | 1 +
 4 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a9024797b21f..4c3e5b2314cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2800,6 +2800,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 	case EMULATE_FAIL:
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
 		return 0;
 	default:
 		BUG();
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c9cc9596e1a6..c0e66dd58a47 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3352,6 +3352,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 			kvm_report_emulation_failure(vcpu, "emulation failure");
 			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 			vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+			vcpu->run->internal.ndata = 0;
 			ret = 0;
 			goto out;
 		}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ca62b8e056f9..172639e94392 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -251,6 +251,9 @@ struct kvm_run {
 		} dcr;
 		struct {
 			__u32 suberror;
+			/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
+			__u32 ndata;
+			__u64 data[16];
 		} internal;
 		/* Fix the size of the union. */
 		char padding[256];
@@ -484,6 +487,7 @@ struct kvm_ioeventfd {
 #define KVM_CAP_XEN_HVM 38
 #endif
 #define KVM_CAP_ADJUST_CLOCK 39
+#define KVM_CAP_INTERNAL_ERROR_DATA 40
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bd44fb48ac43..f92ba138007a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1653,6 +1653,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
 	case KVM_CAP_SET_BOOT_CPU_ID:
 #endif
+	case KVM_CAP_INTERNAL_ERROR_DATA:
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQ_ROUTING:
-- 
cgit v1.2.3


From 65ac7264043740572ba804edca03c374d70427c9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 4 Nov 2009 11:59:01 +0200
Subject: KVM: VMX: Report unexpected simultaneous exceptions as internal
 errors

These happen when we trap an exception when another exception is being
delivered; we only expect these with MCEs and page faults.  If something
unexpected happens, things probably went south and we're better off reporting
an internal error and freezing.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c  | 11 ++++++++---
 include/linux/kvm.h |  1 +
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c0e66dd58a47..22fcd27a0b58 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2744,9 +2744,14 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		return handle_machine_check(vcpu);
 
 	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-						!is_page_fault(intr_info))
-		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
-		       "intr info 0x%x\n", __func__, vect_info, intr_info);
+	    !is_page_fault(intr_info)) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+		vcpu->run->internal.ndata = 2;
+		vcpu->run->internal.data[0] = vect_info;
+		vcpu->run->internal.data[1] = intr_info;
+		return 0;
+	}
 
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
 		return 1;  /* already handled by vmx_vcpu_run() */
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 172639e94392..976f4d181858 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -163,6 +163,7 @@ struct kvm_pit_config {
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
+#define KVM_INTERNAL_ERROR_SIMUL_EX 2
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
-- 
cgit v1.2.3


From 3cfc3092f40bc37c57ba556cfd8de4218f2135ab Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Thu, 12 Nov 2009 01:04:25 +0100
Subject: KVM: x86: Add KVM_GET/SET_VCPU_EVENTS

This new IOCTL exports all yet user-invisible states related to
exceptions, interrupts, and NMIs. Together with appropriate user space
changes, this fixes sporadic problems of vmsave/restore, live migration
and system reset.

[avi: future-proof abi by adding a flags field]

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/api.txt       | 49 ++++++++++++++++++++++++++
 arch/x86/include/asm/kvm.h      | 28 +++++++++++++++
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/svm.c              | 22 ++++++++++++
 arch/x86/kvm/vmx.c              | 30 ++++++++++++++++
 arch/x86/kvm/x86.c              | 77 +++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm.h             |  6 ++++
 7 files changed, 214 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 36594ba57723..e1a114161027 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -653,6 +653,55 @@ struct kvm_clock_data {
 	__u32 pad[9];
 };
 
+4.29 KVM_GET_VCPU_EVENTS
+
+Capability: KVM_CAP_VCPU_EVENTS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_vcpu_event (out)
+Returns: 0 on success, -1 on error
+
+Gets currently pending exceptions, interrupts, and NMIs as well as related
+states of the vcpu.
+
+struct kvm_vcpu_events {
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 has_error_code;
+		__u8 pad;
+		__u32 error_code;
+	} exception;
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 soft;
+		__u8 pad;
+	} interrupt;
+	struct {
+		__u8 injected;
+		__u8 pending;
+		__u8 masked;
+		__u8 pad;
+	} nmi;
+	__u32 sipi_vector;
+	__u32 flags;   /* must be zero */
+};
+
+4.30 KVM_SET_VCPU_EVENTS
+
+Capability: KVM_CAP_VCPU_EVENTS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_vcpu_event (in)
+Returns: 0 on success, -1 on error
+
+Set pending exceptions, interrupts, and NMIs as well as related states of the
+vcpu.
+
+See KVM_GET_VCPU_EVENTS for the data structure.
+
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ef9b4b73cce4..950df434763f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -20,6 +20,7 @@
 #define __KVM_HAVE_MCE
 #define __KVM_HAVE_PIT_STATE2
 #define __KVM_HAVE_XEN_HVM
+#define __KVM_HAVE_VCPU_EVENTS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -252,4 +253,31 @@ struct kvm_reinject_control {
 	__u8 pit_reinject;
 	__u8 reserved[31];
 };
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 has_error_code;
+		__u8 pad;
+		__u32 error_code;
+	} exception;
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 soft;
+		__u8 pad;
+	} interrupt;
+	struct {
+		__u8 injected;
+		__u8 pending;
+		__u8 masked;
+		__u8 pad;
+	} nmi;
+	__u32 sipi_vector;
+	__u32 flags;
+	__u32 reserved[10];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 26a74b7bb6bc..06e085614dad 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -523,6 +523,8 @@ struct kvm_x86_ops {
 				bool has_error_code, u32 error_code);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
+	void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 34b700f9e498..3de0b37ec038 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2499,6 +2499,26 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 		!(svm->vcpu.arch.hflags & HF_NMI_MASK);
 }
 
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (masked) {
+		svm->vcpu.arch.hflags |= HF_NMI_MASK;
+		svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+	} else {
+		svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+		svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+	}
+}
+
 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2946,6 +2966,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.queue_exception = svm_queue_exception,
 	.interrupt_allowed = svm_interrupt_allowed,
 	.nmi_allowed = svm_nmi_allowed,
+	.get_nmi_mask = svm_get_nmi_mask,
+	.set_nmi_mask = svm_set_nmi_mask,
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 22fcd27a0b58..778f059ae423 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2639,6 +2639,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 				GUEST_INTR_STATE_NMI));
 }
 
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+	if (!cpu_has_virtual_nmis())
+		return to_vmx(vcpu)->soft_vnmi_blocked;
+	else
+		return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+			  GUEST_INTR_STATE_NMI);
+}
+
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!cpu_has_virtual_nmis()) {
+		if (vmx->soft_vnmi_blocked != masked) {
+			vmx->soft_vnmi_blocked = masked;
+			vmx->vnmi_blocked_time = 0;
+		}
+	} else {
+		if (masked)
+			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+				      GUEST_INTR_STATE_NMI);
+		else
+			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+					GUEST_INTR_STATE_NMI);
+	}
+}
+
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -3985,6 +4013,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.queue_exception = vmx_queue_exception,
 	.interrupt_allowed = vmx_interrupt_allowed,
 	.nmi_allowed = vmx_nmi_allowed,
+	.get_nmi_mask = vmx_get_nmi_mask,
+	.set_nmi_mask = vmx_set_nmi_mask,
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ba8958dca3c4..35eea30821d6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1342,6 +1342,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
 	case KVM_CAP_XEN_HVM:
 	case KVM_CAP_ADJUST_CLOCK:
+	case KVM_CAP_VCPU_EVENTS:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -1883,6 +1884,61 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+					       struct kvm_vcpu_events *events)
+{
+	vcpu_load(vcpu);
+
+	events->exception.injected = vcpu->arch.exception.pending;
+	events->exception.nr = vcpu->arch.exception.nr;
+	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+	events->exception.error_code = vcpu->arch.exception.error_code;
+
+	events->interrupt.injected = vcpu->arch.interrupt.pending;
+	events->interrupt.nr = vcpu->arch.interrupt.nr;
+	events->interrupt.soft = vcpu->arch.interrupt.soft;
+
+	events->nmi.injected = vcpu->arch.nmi_injected;
+	events->nmi.pending = vcpu->arch.nmi_pending;
+	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
+
+	events->sipi_vector = vcpu->arch.sipi_vector;
+
+	events->flags = 0;
+
+	vcpu_put(vcpu);
+}
+
+static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+					      struct kvm_vcpu_events *events)
+{
+	if (events->flags)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+
+	vcpu->arch.exception.pending = events->exception.injected;
+	vcpu->arch.exception.nr = events->exception.nr;
+	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
+	vcpu->arch.exception.error_code = events->exception.error_code;
+
+	vcpu->arch.interrupt.pending = events->interrupt.injected;
+	vcpu->arch.interrupt.nr = events->interrupt.nr;
+	vcpu->arch.interrupt.soft = events->interrupt.soft;
+	if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
+		kvm_pic_clear_isr_ack(vcpu->kvm);
+
+	vcpu->arch.nmi_injected = events->nmi.injected;
+	vcpu->arch.nmi_pending = events->nmi.pending;
+	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
+
+	vcpu->arch.sipi_vector = events->sipi_vector;
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -2040,6 +2096,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
 		break;
 	}
+	case KVM_GET_VCPU_EVENTS: {
+		struct kvm_vcpu_events events;
+
+		kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_VCPU_EVENTS: {
+		struct kvm_vcpu_events events;
+
+		r = -EFAULT;
+		if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 976f4d181858..92045a92d714 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -489,6 +489,9 @@ struct kvm_ioeventfd {
 #endif
 #define KVM_CAP_ADJUST_CLOCK 39
 #define KVM_CAP_INTERNAL_ERROR_DATA 40
+#ifdef __KVM_HAVE_VCPU_EVENTS
+#define KVM_CAP_VCPU_EVENTS 41
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +675,9 @@ struct kvm_clock_data {
 /* IA64 stack access */
 #define KVM_IA64_VCPU_GET_STACK   _IOR(KVMIO,  0x9a, void *)
 #define KVM_IA64_VCPU_SET_STACK   _IOW(KVMIO,  0x9b, void *)
+/* Available with KVM_CAP_VCPU_EVENTS */
+#define KVM_GET_VCPU_EVENTS       _IOR(KVMIO,  0x9f, struct kvm_vcpu_events)
+#define KVM_SET_VCPU_EVENTS       _IOW(KVMIO,  0xa0, struct kvm_vcpu_events)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
cgit v1.2.3


From d7b0b5eb3000c6fb902f08c619fcd673a23d8fab Mon Sep 17 00:00:00 2001
From: Carsten Otte <carsteno@de.ibm.com>
Date: Thu, 19 Nov 2009 14:21:16 +0100
Subject: KVM: s390: Make psw available on all exits, not just a subset

This patch moves s390 processor status word into the base kvm_run
struct and keeps it up-to date on all userspace exits.

The userspace ABI is broken by this, however there are no applications
in the wild using this.  A capability check is provided so users can
verify the updated API exists.

Cc: stable@kernel.org
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/include/asm/kvm.h |  3 ++-
 arch/s390/kvm/kvm-s390.c    | 25 +++++++++++++++++--------
 include/linux/kvm.h         |  8 ++++++--
 3 files changed, 25 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h
index 3dfcaeb5d7f4..82b32a100c7d 100644
--- a/arch/s390/include/asm/kvm.h
+++ b/arch/s390/include/asm/kvm.h
@@ -1,6 +1,5 @@
 #ifndef __LINUX_KVM_S390_H
 #define __LINUX_KVM_S390_H
-
 /*
  * asm-s390/kvm.h - KVM s390 specific structures and definitions
  *
@@ -15,6 +14,8 @@
  */
 #include <linux/types.h>
 
+#define __KVM_S390
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
 	/* general purpose regs for s390 */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 544505893c9f..f8bcaefd7d34 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -117,10 +117,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
 
 int kvm_dev_ioctl_check_extension(long ext)
 {
+	int r;
+
 	switch (ext) {
+	case KVM_CAP_S390_PSW:
+		r = 1;
+		break;
 	default:
-		return 0;
+		r = 0;
 	}
+	return r;
 }
 
 /* Section: vm related */
@@ -420,8 +426,10 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
 	vcpu_load(vcpu);
 	if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
 		rc = -EBUSY;
-	else
-		vcpu->arch.sie_block->gpsw = psw;
+	else {
+		vcpu->run->psw_mask = psw.mask;
+		vcpu->run->psw_addr = psw.addr;
+	}
 	vcpu_put(vcpu);
 	return rc;
 }
@@ -509,9 +517,6 @@ rerun_vcpu:
 
 	switch (kvm_run->exit_reason) {
 	case KVM_EXIT_S390_SIEIC:
-		vcpu->arch.sie_block->gpsw.mask = kvm_run->s390_sieic.mask;
-		vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr;
-		break;
 	case KVM_EXIT_UNKNOWN:
 	case KVM_EXIT_INTR:
 	case KVM_EXIT_S390_RESET:
@@ -520,6 +525,9 @@ rerun_vcpu:
 		BUG();
 	}
 
+	vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
+	vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
+
 	might_fault();
 
 	do {
@@ -539,8 +547,6 @@ rerun_vcpu:
 		/* intercept cannot be handled in-kernel, prepare kvm-run */
 		kvm_run->exit_reason         = KVM_EXIT_S390_SIEIC;
 		kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode;
-		kvm_run->s390_sieic.mask     = vcpu->arch.sie_block->gpsw.mask;
-		kvm_run->s390_sieic.addr     = vcpu->arch.sie_block->gpsw.addr;
 		kvm_run->s390_sieic.ipa      = vcpu->arch.sie_block->ipa;
 		kvm_run->s390_sieic.ipb      = vcpu->arch.sie_block->ipb;
 		rc = 0;
@@ -552,6 +558,9 @@ rerun_vcpu:
 		rc = 0;
 	}
 
+	kvm_run->psw_mask     = vcpu->arch.sie_block->gpsw.mask;
+	kvm_run->psw_addr     = vcpu->arch.sie_block->gpsw.addr;
+
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 92045a92d714..2d241da07236 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -181,6 +181,11 @@ struct kvm_run {
 	__u64 cr8;
 	__u64 apic_base;
 
+#ifdef __KVM_S390
+	/* the processor status word for s390 */
+	__u64 psw_mask; /* psw upper half */
+	__u64 psw_addr; /* psw lower half */
+#endif
 	union {
 		/* KVM_EXIT_UNKNOWN */
 		struct {
@@ -232,8 +237,6 @@ struct kvm_run {
 		/* KVM_EXIT_S390_SIEIC */
 		struct {
 			__u8 icptcode;
-			__u64 mask; /* psw upper half */
-			__u64 addr; /* psw lower half */
 			__u16 ipa;
 			__u32 ipb;
 		} s390_sieic;
@@ -492,6 +495,7 @@ struct kvm_ioeventfd {
 #ifdef __KVM_HAVE_VCPU_EVENTS
 #define KVM_CAP_VCPU_EVENTS 41
 #endif
+#define KVM_CAP_S390_PSW 42
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From 6013efd8860bf15c1f86f365332642cfe557152f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 19 Nov 2009 15:36:45 +0900
Subject: libata: retry failed FLUSH if device didn't fail it

If ATA device failed FLUSH, it means that the device failed to write
out some amount of data and the error needs to be reported to upper
layers. As retries can't recover the lost data, FLUSH failures need to
be reported immediately in general.

However, if FLUSH fails due to transmission errors, the FLUSH needs to
be retried; otherwise, filesystems may switch to RO mode and/or raid
array may drop a drive for a random transmission glitch.

This condition can be rather easily reproduced on certain ahci
controllers which go through a PHY event after powersave mode switch +
ext4 combination.  Powersave mode switch is often closely followed by
flush from the filesystem failing the FLUSH with ATA bus error which
makes the filesystem code believe that data is lost and drop to RO
mode.  This was reported in the following bugzilla bug.

  http://bugzilla.kernel.org/show_bug.cgi?id=14543

This patch makes libata EH retry FLUSH if it wasn't failed by the
device.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Andrey Vihrov <andrey.vihrov@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-eh.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h  |  2 +-
 2 files changed, 95 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index bba2ae5df1c2..0ea97c942ced 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -110,6 +110,13 @@ static const unsigned long ata_eh_identify_timeouts[] = {
 	ULONG_MAX,
 };
 
+static const unsigned long ata_eh_flush_timeouts[] = {
+	15000,	/* be generous with flush */
+	15000,  /* ditto */
+	30000,	/* and even more generous */
+	ULONG_MAX,
+};
+
 static const unsigned long ata_eh_other_timeouts[] = {
 	 5000,	/* same rationale as identify timeout */
 	10000,	/* ditto */
@@ -147,6 +154,8 @@ ata_eh_cmd_timeout_table[ATA_EH_CMD_TIMEOUT_TABLE_SIZE] = {
 	  .timeouts = ata_eh_other_timeouts, },
 	{ .commands = CMDS(ATA_CMD_INIT_DEV_PARAMS),
 	  .timeouts = ata_eh_other_timeouts, },
+	{ .commands = CMDS(ATA_CMD_FLUSH, ATA_CMD_FLUSH_EXT),
+	  .timeouts = ata_eh_flush_timeouts },
 };
 #undef CMDS
 
@@ -3112,6 +3121,82 @@ static int atapi_eh_clear_ua(struct ata_device *dev)
 	return 0;
 }
 
+/**
+ *	ata_eh_maybe_retry_flush - Retry FLUSH if necessary
+ *	@dev: ATA device which may need FLUSH retry
+ *
+ *	If @dev failed FLUSH, it needs to be reported upper layer
+ *	immediately as it means that @dev failed to remap and already
+ *	lost at least a sector and further FLUSH retrials won't make
+ *	any difference to the lost sector.  However, if FLUSH failed
+ *	for other reasons, for example transmission error, FLUSH needs
+ *	to be retried.
+ *
+ *	This function determines whether FLUSH failure retry is
+ *	necessary and performs it if so.
+ *
+ *	RETURNS:
+ *	0 if EH can continue, -errno if EH needs to be repeated.
+ */
+static int ata_eh_maybe_retry_flush(struct ata_device *dev)
+{
+	struct ata_link *link = dev->link;
+	struct ata_port *ap = link->ap;
+	struct ata_queued_cmd *qc;
+	struct ata_taskfile tf;
+	unsigned int err_mask;
+	int rc = 0;
+
+	/* did flush fail for this device? */
+	if (!ata_tag_valid(link->active_tag))
+		return 0;
+
+	qc = __ata_qc_from_tag(ap, link->active_tag);
+	if (qc->dev != dev || (qc->tf.command != ATA_CMD_FLUSH_EXT &&
+			       qc->tf.command != ATA_CMD_FLUSH))
+		return 0;
+
+	/* if the device failed it, it should be reported to upper layers */
+	if (qc->err_mask & AC_ERR_DEV)
+		return 0;
+
+	/* flush failed for some other reason, give it another shot */
+	ata_tf_init(dev, &tf);
+
+	tf.command = qc->tf.command;
+	tf.flags |= ATA_TFLAG_DEVICE;
+	tf.protocol = ATA_PROT_NODATA;
+
+	ata_dev_printk(dev, KERN_WARNING, "retrying FLUSH 0x%x Emask 0x%x\n",
+		       tf.command, qc->err_mask);
+
+	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
+	if (!err_mask) {
+		/*
+		 * FLUSH is complete but there's no way to
+		 * successfully complete a failed command from EH.
+		 * Making sure retry is allowed at least once and
+		 * retrying it should do the trick - whatever was in
+		 * the cache is already on the platter and this won't
+		 * cause infinite loop.
+		 */
+		qc->scsicmd->allowed = max(qc->scsicmd->allowed, 1);
+	} else {
+		ata_dev_printk(dev, KERN_WARNING, "FLUSH failed Emask 0x%x\n",
+			       err_mask);
+		rc = -EIO;
+
+		/* if device failed it, report it to upper layers */
+		if (err_mask & AC_ERR_DEV) {
+			qc->err_mask |= AC_ERR_DEV;
+			qc->result_tf = tf;
+			if (!(ap->pflags & ATA_PFLAG_FROZEN))
+				rc = 0;
+		}
+	}
+	return rc;
+}
+
 static int ata_link_nr_enabled(struct ata_link *link)
 {
 	struct ata_device *dev;
@@ -3455,6 +3540,15 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 			}
 		}
 
+		/* retry flush if necessary */
+		ata_for_each_dev(dev, link, ALL) {
+			if (dev->class != ATA_DEV_ATA)
+				continue;
+			rc = ata_eh_maybe_retry_flush(dev);
+			if (rc)
+				goto dev_fail;
+		}
+
 		/* configure link power saving */
 		if (ehc->i.action & ATA_EH_LPM)
 			ata_for_each_dev(dev, link, ALL)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 87698640c091..ba07e84c9840 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -365,7 +365,7 @@ enum {
 	/* This should match the actual table size of
 	 * ata_eh_cmd_timeout_table in libata-eh.c.
 	 */
-	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 5,
+	ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 6,
 
 	/* Horkage types. May be set by libata or controller on drives
 	   (some horkage may be drive/controller pair dependant */
-- 
cgit v1.2.3


From 18f0f97850059303ed73b1f02084f55ca330a80c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 17 Nov 2009 10:00:47 -0500
Subject: libata: add translation for SCSI WRITE SAME (aka TRIM support)

Add support for the ATA TRIM command in libata.  We translate a WRITE SAME 16
command with the unmap bit set into an ATA TRIM command and export enough
information in READ CAPACITY 16 and the block limits EVPD page so that the new
SCSI layer discard support will driver this for us.

Note that I hardcode the WRITE_SAME_16 opcode for now as the patch to introduce
the symbolic is not in 2.6.32 yet but only in the SCSI tree - as soon as it is
merged we can fix it up to properly use the symbolic name.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-scsi.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ata.h       | 13 +++++++
 2 files changed, 111 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 512a3ee8caf6..340a616d226b 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -47,6 +47,7 @@
 #include <linux/hdreg.h>
 #include <linux/uaccess.h>
 #include <linux/suspend.h>
+#include <asm/unaligned.h>
 
 #include "libata.h"
 
@@ -1963,6 +1964,7 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
 		0x80,	/* page 0x80, unit serial no page */
 		0x83,	/* page 0x83, device ident page */
 		0x89,	/* page 0x89, ata info page */
+		0xb0,	/* page 0xb0, block limits page */
 		0xb1,	/* page 0xb1, block device characteristics page */
 	};
 
@@ -2084,6 +2086,41 @@ static unsigned int ata_scsiop_inq_89(struct ata_scsi_args *args, u8 *rbuf)
 	return 0;
 }
 
+static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf)
+{
+	u32 min_io_sectors;
+
+	rbuf[1] = 0xb0;
+	rbuf[3] = 0x3c;		/* required VPD size with unmap support */
+
+	/*
+	 * Optimal transfer length granularity.
+	 *
+	 * This is always one physical block, but for disks with a smaller
+	 * logical than physical sector size we need to figure out what the
+	 * latter is.
+	 */
+	if (ata_id_has_large_logical_sectors(args->id))
+		min_io_sectors = ata_id_logical_per_physical_sectors(args->id);
+	else
+		min_io_sectors = 1;
+	put_unaligned_be16(min_io_sectors, &rbuf[6]);
+
+	/*
+	 * Optimal unmap granularity.
+	 *
+	 * The ATA spec doesn't even know about a granularity or alignment
+	 * for the TRIM command.  We can leave away most of the unmap related
+	 * VPD page entries, but we have specifify a granularity to signal
+	 * that we support some form of unmap - in thise case via WRITE SAME
+	 * with the unmap bit set.
+	 */
+	if (ata_id_has_trim(args->id))
+		put_unaligned_be32(1, &rbuf[28]);
+
+	return 0;
+}
+
 static unsigned int ata_scsiop_inq_b1(struct ata_scsi_args *args, u8 *rbuf)
 {
 	int form_factor = ata_id_form_factor(args->id);
@@ -2373,6 +2410,9 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
 		rbuf[13] = log_per_phys;
 		rbuf[14] = (lowest_aligned >> 8) & 0x3f;
 		rbuf[15] = lowest_aligned;
+
+		if (ata_id_has_trim(args->id))
+			rbuf[14] |= 0x80;
 	}
 
 	return 0;
@@ -2895,6 +2935,58 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
 	return 1;
 }
 
+static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
+{
+	struct ata_taskfile *tf = &qc->tf;
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	struct ata_device *dev = qc->dev;
+	const u8 *cdb = scmd->cmnd;
+	u64 block;
+	u32 n_block;
+	u32 size;
+	void *buf;
+
+	/* we may not issue DMA commands if no DMA mode is set */
+	if (unlikely(!dev->dma_mode))
+		goto invalid_fld;
+
+	if (unlikely(scmd->cmd_len < 16))
+		goto invalid_fld;
+	scsi_16_lba_len(cdb, &block, &n_block);
+
+	/* for now we only support WRITE SAME with the unmap bit set */
+	if (unlikely(!(cdb[1] & 0x8)))
+		goto invalid_fld;
+
+	/*
+	 * WRITE SAME always has a sector sized buffer as payload, this
+	 * should never be a multiple entry S/G list.
+	 */
+	if (!scsi_sg_count(scmd))
+		goto invalid_fld;
+
+	buf = page_address(sg_page(scsi_sglist(scmd)));
+	size = ata_set_lba_range_entries(buf, 512 / 8, block, n_block);
+
+	tf->protocol = ATA_PROT_DMA;
+	tf->hob_feature = 0;
+	tf->feature = ATA_DSM_TRIM;
+	tf->hob_nsect = (size / 512) >> 8;
+	tf->nsect = size / 512;
+	tf->command = ATA_CMD_DSM;
+	tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48 |
+		     ATA_TFLAG_WRITE;
+
+	ata_qc_set_pc_nbytes(qc);
+
+	return 0;
+
+ invalid_fld:
+	ata_scsi_set_sense(scmd, ILLEGAL_REQUEST, 0x24, 0x00);
+	/* "Invalid field in cdb" */
+	return 1;
+}
+
 /**
  *	ata_get_xlat_func - check if SCSI to ATA translation is possible
  *	@dev: ATA device
@@ -2919,6 +3011,9 @@ static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd)
 	case WRITE_16:
 		return ata_scsi_rw_xlat;
 
+	case 0x93 /*WRITE_SAME_16*/:
+		return ata_scsi_write_same_xlat;
+
 	case SYNCHRONIZE_CACHE:
 		if (ata_try_flush_cache(dev))
 			return ata_scsi_flush_xlat;
@@ -3108,6 +3203,9 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd,
 		case 0x89:
 			ata_scsi_rbuf_fill(&args, ata_scsiop_inq_89);
 			break;
+		case 0xb0:
+			ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b0);
+			break;
 		case 0xb1:
 			ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b1);
 			break;
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 4fb357312b3b..e2595e877e44 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -87,6 +87,7 @@ enum {
 	ATA_ID_HW_CONFIG	= 93,
 	ATA_ID_SPG		= 98,
 	ATA_ID_LBA_CAPACITY_2	= 100,
+	ATA_ID_SECTOR_SIZE	= 106,
 	ATA_ID_LAST_LUN		= 126,
 	ATA_ID_DLF		= 128,
 	ATA_ID_CSFO		= 129,
@@ -638,6 +639,18 @@ static inline int ata_id_flush_ext_enabled(const u16 *id)
 	return (id[ATA_ID_CFS_ENABLE_2] & 0x2400) == 0x2400;
 }
 
+static inline int ata_id_has_large_logical_sectors(const u16 *id)
+{
+	if ((id[ATA_ID_SECTOR_SIZE] & 0xc000) != 0x4000)
+		return 0;
+	return id[ATA_ID_SECTOR_SIZE] & (1 << 13);
+}
+
+static inline u8 ata_id_logical_per_physical_sectors(const u16 *id)
+{
+	return id[ATA_ID_SECTOR_SIZE] & 0xf;
+}
+
 static inline int ata_id_has_lba48(const u16 *id)
 {
 	if ((id[ATA_ID_COMMAND_SET_2] & 0xC000) != 0x4000)
-- 
cgit v1.2.3


From 98262f2762f0067375f83824d81ea929e37e6bfe Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Thu, 3 Dec 2009 09:24:48 +0100
Subject: block: Allow devices to indicate whether discarded blocks are zeroed

The discard ioctl is used by mkfs utilities to clear a block device
prior to putting metadata down.  However, not all devices return zeroed
blocks after a discard.  Some drives return stale data, potentially
containing old superblocks.  It is therefore important to know whether
discarded blocks are properly zeroed.

Both ATA and SCSI drives have configuration bits that indicate whether
zeroes are returned after a discard operation.  Implement a block level
interface that allows this information to be bubbled up the stack and
queried via a new block device ioctl.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c   |  2 ++
 block/blk-sysfs.c      | 11 +++++++++++
 block/compat_ioctl.c   |  2 ++
 block/ioctl.c          |  2 ++
 include/linux/blkdev.h | 14 ++++++++++++++
 include/linux/fs.h     |  1 +
 6 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1ebc1fdb9144..dd1f1e0e196f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -101,6 +101,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->discard_granularity = 0;
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
+	lim->discard_zeroes_data = -1;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -544,6 +545,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	t->io_min = max(t->io_min, b->io_min);
 	t->no_cluster |= b->no_cluster;
+	t->discard_zeroes_data &= b->discard_zeroes_data;
 
 	/* Bottom device offset aligned? */
 	if (offset &&
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3147145edc15..8606c9543fdd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -136,6 +136,11 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
 	return queue_var_show(q->limits.max_discard_sectors << 9, page);
 }
 
+static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_discard_zeroes_data(q), page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -313,6 +318,11 @@ static struct queue_sysfs_entry queue_discard_max_entry = {
 	.show = queue_discard_max_show,
 };
 
+static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
+	.attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
+	.show = queue_discard_zeroes_data_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
@@ -350,6 +360,7 @@ static struct attribute *default_attrs[] = {
 	&queue_io_opt_entry.attr,
 	&queue_discard_granularity_entry.attr,
 	&queue_discard_max_entry.attr,
+	&queue_discard_zeroes_data_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 9bd086c1a4d5..4eb8e9ea4af5 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		return compat_put_uint(arg, bdev_io_opt(bdev));
 	case BLKALIGNOFF:
 		return compat_put_int(arg, bdev_alignment_offset(bdev));
+	case BLKDISCARDZEROES:
+		return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
 	case BLKFLSBUF:
 	case BLKROSET:
 	case BLKDISCARD:
diff --git a/block/ioctl.c b/block/ioctl.c
index 1f4d1de12b09..be48ea51faee 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -280,6 +280,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		return put_uint(arg, bdev_io_opt(bdev));
 	case BLKALIGNOFF:
 		return put_int(arg, bdev_alignment_offset(bdev));
+	case BLKDISCARDZEROES:
+		return put_uint(arg, bdev_discard_zeroes_data(bdev));
 	case BLKSECTGET:
 		return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
 	case BLKRASET:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e727f6c44c44..784a919aa0d0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -322,6 +322,7 @@ struct queue_limits {
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
 	unsigned char		no_cluster;
+	signed char		discard_zeroes_data;
 };
 
 struct request_queue
@@ -1150,6 +1151,19 @@ static inline int queue_sector_discard_alignment(struct request_queue *q,
 		& (q->limits.discard_granularity - 1);
 }
 
+static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
+{
+	if (q->limits.discard_zeroes_data == 1)
+		return 1;
+
+	return 0;
+}
+
+static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
+{
+	return queue_discard_zeroes_data(bdev_get_queue(bdev));
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 79cea8051736..891f7d642e5c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -304,6 +304,7 @@ struct inodes_stat_t {
 #define BLKIOOPT _IO(0x12,121)
 #define BLKALIGNOFF _IO(0x12,122)
 #define BLKPBSZGET _IO(0x12,123)
+#define BLKDISCARDZEROES _IO(0x12,124)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
-- 
cgit v1.2.3


From 796bd9524731850967d437b7f47a86acc776ea89 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 29 Sep 2009 12:27:23 +0100
Subject: VFS: Add forget_all_cached_acls()

This is required for cluster filesystems which want to use
cached ACLs so that they can invalidate the cache when
required.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Alexander Viro <aviro@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
---
 include/linux/posix_acl.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 065a3652a3ea..67608161df6b 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -147,6 +147,20 @@ static inline void forget_cached_acl(struct inode *inode, int type)
 	if (old != ACL_NOT_CACHED)
 		posix_acl_release(old);
 }
+
+static inline void forget_all_cached_acls(struct inode *inode)
+{
+	struct posix_acl *old_access, *old_default;
+	spin_lock(&inode->i_lock);
+	old_access = inode->i_acl;
+	old_default = inode->i_default_acl;
+	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
+	spin_unlock(&inode->i_lock);
+	if (old_access != ACL_NOT_CACHED)
+		posix_acl_release(old_access);
+	if (old_default != ACL_NOT_CACHED)
+		posix_acl_release(old_default);
+}
 #endif
 
 static inline void cache_no_acl(struct inode *inode)
-- 
cgit v1.2.3


From 86e931a35e93d94e6e91b57cc76456e16d188ea9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 28 Sep 2009 12:35:17 +0100
Subject: VFS: Export dquot_send_warning

Sending a message to userspace in a generic format to warn
of events (e.g. quota exceeded) in the quota subsystem is
a generically useful feature. This patch makes some minor
changes to the send_message function from dquot.c renaming
it quota_send_message, moving it to quota.c and exporting it
for use by filesystems which do not use the dquot code.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/quota/Kconfig      |  2 +-
 fs/quota/dquot.c      | 93 +++++----------------------------------------------
 fs/quota/quota.c      | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/quota.h | 11 ++++++
 4 files changed, 114 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 8047e01ef46b..353e78a9ebee 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -17,7 +17,7 @@ config QUOTA
 
 config QUOTA_NETLINK_INTERFACE
 	bool "Report quota messages through netlink interface"
-	depends on QUOTA && NET
+	depends on QUOTACTL && NET
 	help
 	  If you say Y here, quota warnings (about exceeding softlimit, reaching
 	  hardlimit, etc.) will be reported through netlink interface. If unsure,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 39b49c42a7ed..9b6ad908dcb2 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -77,10 +77,6 @@
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h> /* for inode_lock, oddly enough.. */
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-#include <net/netlink.h>
-#include <net/genetlink.h>
-#endif
 
 #include <asm/uaccess.h>
 
@@ -1071,73 +1067,6 @@ static void print_warning(struct dquot *dquot, const int warntype)
 }
 #endif
 
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-
-/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
-	.id = GENL_ID_GENERATE,
-	.hdrsize = 0,
-	.name = "VFS_DQUOT",
-	.version = 1,
-	.maxattr = QUOTA_NL_A_MAX,
-};
-
-/* Send warning to userspace about user which exceeded quota */
-static void send_warning(const struct dquot *dquot, const char warntype)
-{
-	static atomic_t seq;
-	struct sk_buff *skb;
-	void *msg_head;
-	int ret;
-	int msg_size = 4 * nla_total_size(sizeof(u32)) +
-		       2 * nla_total_size(sizeof(u64));
-
-	/* We have to allocate using GFP_NOFS as we are called from a
-	 * filesystem performing write and thus further recursion into
-	 * the fs to free some data could cause deadlocks. */
-	skb = genlmsg_new(msg_size, GFP_NOFS);
-	if (!skb) {
-		printk(KERN_ERR
-		  "VFS: Not enough memory to send quota warning.\n");
-		return;
-	}
-	msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
-			&quota_genl_family, 0, QUOTA_NL_C_WARNING);
-	if (!msg_head) {
-		printk(KERN_ERR
-		  "VFS: Cannot store netlink header in quota warning.\n");
-		goto err_out;
-	}
-	ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, dquot->dq_type);
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, dquot->dq_id);
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR,
-		MAJOR(dquot->dq_sb->s_dev));
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR,
-		MINOR(dquot->dq_sb->s_dev));
-	if (ret)
-		goto attr_err_out;
-	ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
-	if (ret)
-		goto attr_err_out;
-	genlmsg_end(skb, msg_head);
-
-	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
-	return;
-attr_err_out:
-	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
-err_out:
-	kfree_skb(skb);
-}
-#endif
 /*
  * Write warnings to the console and send warning messages over netlink.
  *
@@ -1145,18 +1074,20 @@ err_out:
  */
 static void flush_warnings(struct dquot *const *dquots, char *warntype)
 {
+	struct dquot *dq;
 	int i;
 
-	for (i = 0; i < MAXQUOTAS; i++)
-		if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN &&
-		    !warning_issued(dquots[i], warntype[i])) {
+	for (i = 0; i < MAXQUOTAS; i++) {
+		dq = dquots[i];
+		if (dq && warntype[i] != QUOTA_NL_NOWARN &&
+		    !warning_issued(dq, warntype[i])) {
 #ifdef CONFIG_PRINT_QUOTA_WARNING
-			print_warning(dquots[i], warntype[i]);
-#endif
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-			send_warning(dquots[i], warntype[i]);
+			print_warning(dq, warntype[i]);
 #endif
+			quota_send_warning(dq->dq_type, dq->dq_id,
+					   dq->dq_sb->s_dev, warntype[i]);
 		}
+	}
 }
 
 static int ignore_hardlimit(struct dquot *dquot)
@@ -2607,12 +2538,6 @@ static int __init dquot_init(void)
 
 	register_shrinker(&dqcache_shrinker);
 
-#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
-	if (genl_register_family(&quota_genl_family) != 0)
-		printk(KERN_ERR
-		       "VFS: Failed to create quota netlink interface.\n");
-#endif
-
 	return 0;
 }
 module_init(dquot_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95c5b42384b2..ee91e2756950 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -18,6 +18,8 @@
 #include <linux/capability.h>
 #include <linux/quotaops.h>
 #include <linux/types.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
 
 /* Check validity of generic quotactl commands */
 static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
@@ -525,3 +527,94 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
 	return ret;
 }
 #endif
+
+
+#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
+
+/* Netlink family structure for quota */
+static struct genl_family quota_genl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = "VFS_DQUOT",
+	.version = 1,
+	.maxattr = QUOTA_NL_A_MAX,
+};
+
+/**
+ * quota_send_warning - Send warning to userspace about exceeded quota
+ * @type: The quota type: USRQQUOTA, GRPQUOTA,...
+ * @id: The user or group id of the quota that was exceeded
+ * @dev: The device on which the fs is mounted (sb->s_dev)
+ * @warntype: The type of the warning: QUOTA_NL_...
+ *
+ * This can be used by filesystems (including those which don't use
+ * dquot) to send a message to userspace relating to quota limits.
+ *
+ */
+
+void quota_send_warning(short type, unsigned int id, dev_t dev,
+			const char warntype)
+{
+	static atomic_t seq;
+	struct sk_buff *skb;
+	void *msg_head;
+	int ret;
+	int msg_size = 4 * nla_total_size(sizeof(u32)) +
+		       2 * nla_total_size(sizeof(u64));
+
+	/* We have to allocate using GFP_NOFS as we are called from a
+	 * filesystem performing write and thus further recursion into
+	 * the fs to free some data could cause deadlocks. */
+	skb = genlmsg_new(msg_size, GFP_NOFS);
+	if (!skb) {
+		printk(KERN_ERR
+		  "VFS: Not enough memory to send quota warning.\n");
+		return;
+	}
+	msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq),
+			&quota_genl_family, 0, QUOTA_NL_C_WARNING);
+	if (!msg_head) {
+		printk(KERN_ERR
+		  "VFS: Cannot store netlink header in quota warning.\n");
+		goto err_out;
+	}
+	ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype);
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev));
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev));
+	if (ret)
+		goto attr_err_out;
+	ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
+	if (ret)
+		goto attr_err_out;
+	genlmsg_end(skb, msg_head);
+
+	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+	return;
+attr_err_out:
+	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
+err_out:
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL(quota_send_warning);
+
+static int __init quota_init(void)
+{
+	if (genl_register_family(&quota_genl_family) != 0)
+		printk(KERN_ERR
+		       "VFS: Failed to create quota netlink interface.\n");
+	return 0;
+};
+
+module_init(quota_init);
+#endif
+
diff --git a/include/linux/quota.h b/include/linux/quota.h
index 78c48895b12a..ce9a9b2e5cd4 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -376,6 +376,17 @@ static inline unsigned int dquot_generic_flag(unsigned int flags, int type)
 	return flags >> _DQUOT_STATE_FLAGS;
 }
 
+#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
+extern void quota_send_warning(short type, unsigned int id, dev_t dev,
+			       const char warntype);
+#else
+static inline void quota_send_warning(short type, unsigned int id, dev_t dev,
+				      const char warntype)
+{
+	return;
+}
+#endif /* CONFIG_QUOTA_NETLINK_INTERFACE */
+
 struct quota_info {
 	unsigned int flags;			/* Flags for diskquotas on this device */
 	struct mutex dqio_mutex;		/* lock device while I/O in progress */
-- 
cgit v1.2.3


From 0ab7d13fcbd7ce1658c563e345990ba453719deb Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 6 Nov 2009 16:20:51 +0000
Subject: GFS2: Tag all metadata with jid

There are two spare field in the header common to all GFS2
metadata. One is just the right size to fit a journal id
in it, and this patch updates the journal code so that each
time a metadata block is modified, we tag it with the journal
id of the node which is performing the modification.

The reason for this is that it should make it much easier to
debug issues which arise if we can tell which node was the
last to modify a particular metadata block.

Since the field is updated before the block is written into
the journal, each journal should only contain metadata which
is tagged with its own journal id. The one exception to this
is the journal header block, which might have a different node's
id in it, if that journal was recovered by another node in the
cluster.

Thus each journal will contain a record of which nodes recovered
it, via the journal header.

The other field in the metadata header could potentially be
used to hold information about what kind of operation was
performed, but for the time being we just zero it on each
transaction so that if we use it for that in future, we'll
know that the information (where it exists) is reliable.

I did consider using the other field to hold the journal
sequence number, however since in GFS2's journaling we write
the modified data into the journal and not the original
data, this gives no information as to what action caused the
modification, so I think we can probably come up with a better
use for those 64 bits in the future.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c             | 2 --
 fs/gfs2/log.c               | 2 ++
 fs/gfs2/lops.c              | 4 ++++
 fs/gfs2/recovery.c          | 2 ++
 include/linux/gfs2_ondisk.h | 6 +++++-
 5 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6380cd9314b0..26ba2a4c4a2d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -947,9 +947,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 
 	str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
 	str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
-	str->di_header.__pad0 = 0;
 	str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
-	str->di_header.__pad1 = 0;
 	str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
 	str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
 	str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 13c6237c5f67..4511b08fc451 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -596,7 +596,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	memset(lh, 0, sizeof(struct gfs2_log_header));
 	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
 	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+	lh->lh_header.__pad0 = cpu_to_be64(0);
 	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+	lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
 	lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
 	lh->lh_flags = cpu_to_be32(flags);
 	lh->lh_tail = cpu_to_be32(tail);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9969ff062c5b..de97632ba32f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -132,6 +132,7 @@ static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
 static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
 	struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+	struct gfs2_meta_header *mh;
 	struct gfs2_trans *tr;
 
 	lock_buffer(bd->bd_bh);
@@ -148,6 +149,9 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
 	gfs2_meta_check(sdp, bd->bd_bh);
 	gfs2_pin(sdp, bd->bd_bh);
+	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+	mh->__pad0 = cpu_to_be64(0);
+	mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
 	sdp->sd_log_num_buf++;
 	list_add(&le->le_list, &sdp->sd_log_le_buf);
 	tr->tr_num_buf_new++;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 09fa31965576..4b9bece3d437 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -410,7 +410,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
 	memset(lh, 0, sizeof(struct gfs2_log_header));
 	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
 	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+	lh->lh_header.__pad0 = cpu_to_be64(0);
 	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+	lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
 	lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
 	lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
 	lh->lh_blkno = cpu_to_be32(lblock);
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
index b80c88dedbbb..81f90a59cda6 100644
--- a/include/linux/gfs2_ondisk.h
+++ b/include/linux/gfs2_ondisk.h
@@ -81,7 +81,11 @@ struct gfs2_meta_header {
 	__be32 mh_type;
 	__be64 __pad0;		/* Was generation number in gfs1 */
 	__be32 mh_format;
-	__be32 __pad1;		/* Was incarnation number in gfs1 */
+	/* This union is to keep userspace happy */
+	union {
+		__be32 mh_jid;		/* Was incarnation number in gfs1 */
+		__be32 __pad1;
+	};
 };
 
 /*
-- 
cgit v1.2.3


From b17621fed6aa039387e35f9b4d34d98f213e5673 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@gmail.com>
Date: Thu, 3 Dec 2009 13:54:25 +0100
Subject: writeback: introduce wbc.for_background

It will lower the flush priority for NFS, and maybe more in future.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c         | 1 +
 fs/nfs/write.c            | 2 +-
 include/linux/writeback.h | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0306c8e7d6b5..0793961f7699 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -738,6 +738,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.sync_mode		= args->sync_mode,
 		.older_than_this	= NULL,
 		.for_kupdate		= args->for_kupdate,
+		.for_background		= args->for_background,
 		.range_cyclic		= args->range_cyclic,
 	};
 	unsigned long oldest_jif;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53eb26c16b50..c84b5cc1a943 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
 {
 	if (wbc->for_reclaim)
 		return FLUSH_HIGHPRI | FLUSH_STABLE;
-	if (wbc->for_kupdate)
+	if (wbc->for_kupdate || wbc->for_background)
 		return FLUSH_LOWPRI;
 	return 0;
 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 66ebddcff664..705f01fe413a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -49,6 +49,7 @@ struct writeback_control {
 	unsigned nonblocking:1;		/* Don't get stuck on request queues */
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
+	unsigned for_background:1;	/* A background writeback */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
-- 
cgit v1.2.3


From 31e4c28d95e64f2d5d3c497a3ecf37c62de635b4 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Thu, 3 Dec 2009 12:59:42 -0500
Subject: blkio: Introduce blkio controller cgroup interface

o This is basic implementation of blkio controller cgroup interface. This is
  the common interface visible to user space and should be used by different
  IO control policies as we implement those.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Kconfig                 |  13 ++++
 block/Kconfig.iosched         |   1 +
 block/Makefile                |   1 +
 block/blk-cgroup.c            | 177 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-cgroup.h            |  58 ++++++++++++++
 include/linux/cgroup_subsys.h |   6 ++
 include/linux/iocontext.h     |   4 +
 7 files changed, 260 insertions(+)
 create mode 100644 block/blk-cgroup.c
 create mode 100644 block/blk-cgroup.h

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..6ba1a8e3388b 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,19 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
+config BLK_CGROUP
+	bool
+	depends on CGROUPS
+	default n
+	---help---
+	Generic block IO controller cgroup interface. This is the common
+	cgroup interface which should be used by various IO controlling
+	policies.
+
+	Currently, CFQ IO scheduler uses it to recognize task groups and
+	control disk bandwidth allocation (proportional time slice allocation)
+	to such task groups.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 8bd105115a69..be0280deec29 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,6 +23,7 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
+	select BLK_CGROUP
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
diff --git a/block/Makefile b/block/Makefile
index 7914108952f2..cb2d515ebd6e 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
+obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
new file mode 100644
index 000000000000..4f6afd76ec59
--- /dev/null
+++ b/block/blk-cgroup.c
@@ -0,0 +1,177 @@
+/*
+ * Common Block IO controller cgroup interface
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * 	              Nauman Rafique <nauman@google.com>
+ */
+#include <linux/ioprio.h>
+#include "blk-cgroup.h"
+
+struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+
+struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
+			    struct blkio_cgroup, css);
+}
+
+void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+				struct blkio_group *blkg, void *key)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkcg->lock, flags);
+	rcu_assign_pointer(blkg->key, key);
+	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+}
+
+int blkiocg_del_blkio_group(struct blkio_group *blkg)
+{
+	/* Implemented later */
+	return 0;
+}
+
+/* called under rcu_read_lock(). */
+struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
+{
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+	void *__key;
+
+	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		__key = blkg->key;
+		if (__key == key)
+			return blkg;
+	}
+
+	return NULL;
+}
+
+#define SHOW_FUNCTION(__VAR)						\
+static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
+				       struct cftype *cftype)		\
+{									\
+	struct blkio_cgroup *blkcg;					\
+									\
+	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
+	return (u64)blkcg->__VAR;					\
+}
+
+SHOW_FUNCTION(weight);
+#undef SHOW_FUNCTION
+
+static int
+blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+{
+	struct blkio_cgroup *blkcg;
+
+	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
+		return -EINVAL;
+
+	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	blkcg->weight = (unsigned int)val;
+	return 0;
+}
+
+struct cftype blkio_files[] = {
+	{
+		.name = "weight",
+		.read_u64 = blkiocg_weight_read,
+		.write_u64 = blkiocg_weight_write,
+	},
+};
+
+static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+	return cgroup_add_files(cgroup, subsys, blkio_files,
+				ARRAY_SIZE(blkio_files));
+}
+
+static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+
+	free_css_id(&blkio_subsys, &blkcg->css);
+	kfree(blkcg);
+}
+
+static struct cgroup_subsys_state *
+blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg, *parent_blkcg;
+
+	if (!cgroup->parent) {
+		blkcg = &blkio_root_cgroup;
+		goto done;
+	}
+
+	/* Currently we do not support hierarchy deeper than two level (0,1) */
+	parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
+	if (css_depth(&parent_blkcg->css) > 0)
+		return ERR_PTR(-EINVAL);
+
+	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+	if (!blkcg)
+		return ERR_PTR(-ENOMEM);
+
+	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
+done:
+	spin_lock_init(&blkcg->lock);
+	INIT_HLIST_HEAD(&blkcg->blkg_list);
+
+	return &blkcg->css;
+}
+
+/*
+ * We cannot support shared io contexts, as we have no mean to support
+ * two tasks with the same ioc in two different groups without major rework
+ * of the main cic data structures.  For now we allow a task to change
+ * its cgroup only if it's the only owner of its ioc.
+ */
+static int blkiocg_can_attach(struct cgroup_subsys *subsys,
+				struct cgroup *cgroup, struct task_struct *tsk,
+				bool threadgroup)
+{
+	struct io_context *ioc;
+	int ret = 0;
+
+	/* task_lock() is needed to avoid races with exit_io_context() */
+	task_lock(tsk);
+	ioc = tsk->io_context;
+	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
+		ret = -EINVAL;
+	task_unlock(tsk);
+
+	return ret;
+}
+
+static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
+				struct cgroup *prev, struct task_struct *tsk,
+				bool threadgroup)
+{
+	struct io_context *ioc;
+
+	task_lock(tsk);
+	ioc = tsk->io_context;
+	if (ioc)
+		ioc->cgroup_changed = 1;
+	task_unlock(tsk);
+}
+
+struct cgroup_subsys blkio_subsys = {
+	.name = "blkio",
+	.create = blkiocg_create,
+	.can_attach = blkiocg_can_attach,
+	.attach = blkiocg_attach,
+	.destroy = blkiocg_destroy,
+	.populate = blkiocg_populate,
+	.subsys_id = blkio_subsys_id,
+	.use_id = 1,
+};
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
new file mode 100644
index 000000000000..ba5703f69b42
--- /dev/null
+++ b/block/blk-cgroup.h
@@ -0,0 +1,58 @@
+#ifndef _BLK_CGROUP_H
+#define _BLK_CGROUP_H
+/*
+ * Common Block IO controller cgroup interface
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * 	              Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/cgroup.h>
+
+struct blkio_cgroup {
+	struct cgroup_subsys_state css;
+	unsigned int weight;
+	spinlock_t lock;
+	struct hlist_head blkg_list;
+};
+
+struct blkio_group {
+	/* An rcu protected unique identifier for the group */
+	void *key;
+	struct hlist_node blkcg_node;
+};
+
+#define BLKIO_WEIGHT_MIN	100
+#define BLKIO_WEIGHT_MAX	1000
+#define BLKIO_WEIGHT_DEFAULT	500
+
+#ifdef CONFIG_BLK_CGROUP
+extern struct blkio_cgroup blkio_root_cgroup;
+extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
+extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+				struct blkio_group *blkg, void *key);
+extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
+extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
+						void *key);
+#else
+static inline struct blkio_cgroup *
+cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
+
+static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+			struct blkio_group *blkg, void *key)
+{
+}
+
+static inline int
+blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
+
+static inline struct blkio_group *
+blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
+#endif
+#endif /* _BLK_CGROUP_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31bacf46..ccefff02b6cb 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -60,3 +60,9 @@ SUBSYS(net_cls)
 #endif
 
 /* */
+
+#ifdef CONFIG_BLK_CGROUP
+SUBSYS(blkio)
+#endif
+
+/* */
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index eb73632440f1..d61b0b8b5cd1 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -68,6 +68,10 @@ struct io_context {
 	unsigned short ioprio;
 	unsigned short ioprio_changed;
 
+#ifdef CONFIG_BLK_CGROUP
+	unsigned short cgroup_changed;
+#endif
+
 	/*
 	 * For request batching
 	 */
-- 
cgit v1.2.3


From 8e182a90f91456335756d2ce304ad470795d98e1 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Mon, 30 Nov 2009 13:23:11 +0000
Subject: pata_piccolo: Driver for old Toshiba chipsets

We were never able to get docs for this out of Toshiba for years. Dave
Barnes produced a NetBSD driver however and from that we can fill in the
needed tables.

As we correct the PCI identifiers a bit also update the old ide generic driver
at the same time so it stays compiling.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/Kconfig           |   9 +++
 drivers/ata/Makefile          |   1 +
 drivers/ata/ata_generic.c     |   5 +-
 drivers/ata/pata_piccolo.c    | 140 ++++++++++++++++++++++++++++++++++++++++++
 drivers/ide/ide-pci-generic.c |   3 +-
 include/linux/pci_ids.h       |   7 ++-
 6 files changed, 160 insertions(+), 5 deletions(-)
 create mode 100644 drivers/ata/pata_piccolo.c

(limited to 'include/linux')

diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
index a47ca75aa078..676f08b004b3 100644
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -667,6 +667,15 @@ config PATA_SIS
 
 	  If unsure, say N.
 
+config PATA_TOSHIBA
+	tristate "Toshiba Piccolo support (Experimental)"
+	depends on PCI && EXPERIMENTAL
+	help
+	  Support for the Toshiba Piccolo controllers. Currently only the
+	  primary channel is supported by this driver.
+
+	  If unsure, say N.
+
 config PATA_VIA
 	tristate "VIA PATA support"
 	depends on PCI
diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile
index 01e126f343b3..d909435e9d81 100644
--- a/drivers/ata/Makefile
+++ b/drivers/ata/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_PATA_RZ1000)	+= pata_rz1000.o
 obj-$(CONFIG_PATA_SC1200)	+= pata_sc1200.o
 obj-$(CONFIG_PATA_SERVERWORKS)	+= pata_serverworks.o
 obj-$(CONFIG_PATA_SIL680)	+= pata_sil680.o
+obj-$(CONFIG_PATA_TOSHIBA)	+= pata_piccolo.o
 obj-$(CONFIG_PATA_VIA)		+= pata_via.o
 obj-$(CONFIG_PATA_WINBOND)	+= pata_sl82c105.o
 obj-$(CONFIG_PATA_WINBOND_VLB)	+= pata_winbond.o
diff --git a/drivers/ata/ata_generic.c b/drivers/ata/ata_generic.c
index ecfd22b4f1ce..12e26c3c68e3 100644
--- a/drivers/ata/ata_generic.c
+++ b/drivers/ata/ata_generic.c
@@ -168,9 +168,12 @@ static struct pci_device_id ata_generic[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_VIA,    PCI_DEVICE_ID_VIA_82C561), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_OPTI,   PCI_DEVICE_ID_OPTI_82C558), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_CENATEK,PCI_DEVICE_ID_CENATEK_IDE), },
-	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO), },
+#if !defined(CONFIG_PATA_TOSHIBA) && !defined(CONFIG_PATA_TOSHIBA_MODULE)
 	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_1), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_2),  },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_3),  },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_5),  },
+#endif	
 	/* Must come last. If you add entries adjust this table appropriately */
 	{ PCI_ANY_ID,		PCI_ANY_ID,			   PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_STORAGE_IDE << 8, 0xFFFFFF00UL, 1},
 	{ 0, },
diff --git a/drivers/ata/pata_piccolo.c b/drivers/ata/pata_piccolo.c
new file mode 100644
index 000000000000..bfe0180f3efa
--- /dev/null
+++ b/drivers/ata/pata_piccolo.c
@@ -0,0 +1,140 @@
+/*
+ *  pata_piccolo.c - Toshiba Piccolo PATA/SATA controller driver.
+ *
+ *  This is basically an update to ata_generic.c to add Toshiba Piccolo support
+ *  then split out to keep ata_generic "clean".
+ *
+ *  Copyright 2005 Red Hat Inc, all rights reserved.
+ *
+ *  Elements from ide/pci/generic.c
+ *	    Copyright (C) 2001-2002	Andre Hedrick <andre@linux-ide.org>
+ *	    Portions (C) Copyright 2002  Red Hat Inc <alan@redhat.com>
+ *
+ *  May be copied or modified under the terms of the GNU General Public License
+ *
+ *  The timing data tables/programming info are courtesy of the NetBSD driver
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <scsi/scsi_host.h>
+#include <linux/libata.h>
+
+#define DRV_NAME "pata_piccolo"
+#define DRV_VERSION "0.0.1"
+
+
+
+static void tosh_set_piomode(struct ata_port *ap, struct ata_device *adev)
+{
+	static const u16 pio[6] = {	/* For reg 0x50 low word & E088 */
+		0x0566, 0x0433, 0x0311, 0x0201, 0x0200, 0x0100
+	};
+	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
+	u16 conf;
+	pci_read_config_word(pdev, 0x50, &conf);
+	conf &= 0xE088;
+	conf |= pio[adev->pio_mode - XFER_PIO_0];
+	pci_write_config_word(pdev, 0x50, conf);
+}
+
+static void tosh_set_dmamode(struct ata_port *ap, struct ata_device *adev)
+{
+	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
+	u32 conf;
+	pci_read_config_dword(pdev, 0x5C, &conf);
+	conf &= 0x78FFE088;	/* Keep the other bits */
+	if (adev->dma_mode >= XFER_UDMA_0) {
+		int udma = adev->dma_mode - XFER_UDMA_0;
+		conf |= 0x80000000;
+		conf |= (udma + 2) << 28;
+		conf |= (2 - udma) * 0x111;	/* spread into three nibbles */
+	} else {
+		static const u32 mwdma[4] = {
+			0x0655, 0x0200, 0x0200, 0x0100
+		};
+		conf |= mwdma[adev->dma_mode - XFER_MW_DMA_0];
+	}
+	pci_write_config_dword(pdev, 0x5C, conf);
+}
+
+
+static struct scsi_host_template tosh_sht = {
+	ATA_BMDMA_SHT(DRV_NAME),
+};
+
+static struct ata_port_operations tosh_port_ops = {
+	.inherits	= &ata_bmdma_port_ops,
+	.cable_detect	= ata_cable_unknown,
+	.set_piomode	= tosh_set_piomode,
+	.set_dmamode	= tosh_set_dmamode
+};
+
+/**
+ *	ata_tosh_init		-	attach generic IDE
+ *	@dev: PCI device found
+ *	@id: match entry
+ *
+ *	Called each time a matching IDE interface is found. We check if the
+ *	interface is one we wish to claim and if so we perform any chip
+ *	specific hacks then let the ATA layer do the heavy lifting.
+ */
+
+static int ata_tosh_init_one(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	static const struct ata_port_info info = {
+		.flags = ATA_FLAG_SLAVE_POSS,
+		.pio_mask = ATA_PIO5,
+		.mwdma_mask = ATA_MWDMA2,
+		.udma_mask = ATA_UDMA2,
+		.port_ops = &tosh_port_ops
+	};
+	const struct ata_port_info *ppi[] = { &info, &ata_dummy_port_info };
+	/* Just one port for the moment */
+	return ata_pci_sff_init_one(dev, ppi, &tosh_sht, NULL);
+}
+
+static struct pci_device_id ata_tosh[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_1), },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_2),  },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_3),  },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_5),  },
+	{ 0, },
+};
+
+static struct pci_driver ata_tosh_pci_driver = {
+	.name 		= DRV_NAME,
+	.id_table	= ata_tosh,
+	.probe 		= ata_tosh_init_one,
+	.remove		= ata_pci_remove_one,
+#ifdef CONFIG_PM
+	.suspend	= ata_pci_device_suspend,
+	.resume		= ata_pci_device_resume,
+#endif
+};
+
+static int __init ata_tosh_init(void)
+{
+	return pci_register_driver(&ata_tosh_pci_driver);
+}
+
+
+static void __exit ata_tosh_exit(void)
+{
+	pci_unregister_driver(&ata_tosh_pci_driver);
+}
+
+
+MODULE_AUTHOR("Alan Cox");
+MODULE_DESCRIPTION("Low level driver for Toshiba Piccolo ATA");
+MODULE_LICENSE("GPL");
+MODULE_DEVICE_TABLE(pci, ata_tosh);
+MODULE_VERSION(DRV_VERSION);
+
+module_init(ata_tosh_init);
+module_exit(ata_tosh_exit);
+
diff --git a/drivers/ide/ide-pci-generic.c b/drivers/ide/ide-pci-generic.c
index 39d4e01f5c9c..a743e68a8903 100644
--- a/drivers/ide/ide-pci-generic.c
+++ b/drivers/ide/ide-pci-generic.c
@@ -162,9 +162,10 @@ static const struct pci_device_id generic_pci_tbl[] = {
 #ifdef CONFIG_BLK_DEV_IDE_SATA
 	{ PCI_VDEVICE(VIA,	PCI_DEVICE_ID_VIA_8237_SATA),		 5 },
 #endif
-	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO),		 4 },
 	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_1),	 4 },
 	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_2),	 4 },
+	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_3),	 4 },
+	{ PCI_VDEVICE(TOSHIBA,	PCI_DEVICE_ID_TOSHIBA_PICCOLO_5),	 4 },
 	{ PCI_VDEVICE(NETCELL,	PCI_DEVICE_ID_REVOLUTION),		 6 },
 	/*
 	 * Must come last.  If you add entries adjust
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 84cf1f3b7838..9ca483be68d5 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1496,9 +1496,10 @@
 #define PCI_DEVICE_ID_SBE_WANXL400	0x0104
 
 #define PCI_VENDOR_ID_TOSHIBA		0x1179
-#define PCI_DEVICE_ID_TOSHIBA_PICCOLO	0x0102
-#define PCI_DEVICE_ID_TOSHIBA_PICCOLO_1	0x0103
-#define PCI_DEVICE_ID_TOSHIBA_PICCOLO_2	0x0105
+#define PCI_DEVICE_ID_TOSHIBA_PICCOLO_1	0x0101
+#define PCI_DEVICE_ID_TOSHIBA_PICCOLO_2	0x0102
+#define PCI_DEVICE_ID_TOSHIBA_PICCOLO_3	0x0103
+#define PCI_DEVICE_ID_TOSHIBA_PICCOLO_5	0x0105
 #define PCI_DEVICE_ID_TOSHIBA_TOPIC95	0x060a
 #define PCI_DEVICE_ID_TOSHIBA_TOPIC97	0x060f
 #define PCI_DEVICE_ID_TOSHIBA_TOPIC100	0x0617
-- 
cgit v1.2.3


From 95514fd8ff0f30de7815950edfd84ef1e19fb1c8 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Wed, 25 Nov 2009 18:12:48 +0100
Subject: libata: add private driver field to struct ata_device

This brings struct ata_device in-line with struct ata_{port,host}.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 include/linux/libata.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index ba07e84c9840..a5b3dc71e819 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -595,6 +595,7 @@ struct ata_device {
 	unsigned int		horkage;	/* List of broken features */
 	unsigned long		flags;		/* ATA_DFLAG_xxx */
 	struct scsi_device	*sdev;		/* attached SCSI device */
+	void			*private_data;
 #ifdef CONFIG_ATA_ACPI
 	acpi_handle		acpi_handle;
 	union acpi_object	*gtf_cache;
-- 
cgit v1.2.3


From 491deb24bf5bf7124141287aaf02c3219783ceab Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 3 Dec 2009 01:25:54 +0000
Subject: net 02/05: fib_rules: rename ifindex/ifname/FRA_IFNAME to
 iifindex/iifname/FRA_IIFNAME

commit 229e77eec406ad68662f18e49fda8b5d366768c5
Author: Patrick McHardy <kaber@trash.net>
Date:   Thu Dec 3 12:05:23 2009 +0100

    net: fib_rules: rename ifindex/ifname/FRA_IFNAME to iifindex/iifname/FRA_IIFNAME

    The next patch will add oif classification, rename interface related members
    and attributes to reflect that they're used for iif classification.

    Signed-off-by: Patrick McHardy <kaber@trash.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/fib_rules.h |  6 ++++--
 include/net/fib_rules.h   |  6 +++---
 net/core/fib_rules.c      | 36 ++++++++++++++++++------------------
 3 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h
index c7e5b700bb91..7e11bb2fa655 100644
--- a/include/linux/fib_rules.h
+++ b/include/linux/fib_rules.h
@@ -8,7 +8,8 @@
 #define FIB_RULE_PERMANENT	0x00000001
 #define FIB_RULE_INVERT		0x00000002
 #define FIB_RULE_UNRESOLVED	0x00000004
-#define FIB_RULE_DEV_DETACHED	0x00000008
+#define FIB_RULE_IIF_DETACHED	0x00000008
+#define FIB_RULE_DEV_DETACHED	FIB_RULE_IIF_DETACHED
 
 /* try to find source address in routing lookups */
 #define FIB_RULE_FIND_SADDR	0x00010000
@@ -31,7 +32,8 @@ enum {
 	FRA_UNSPEC,
 	FRA_DST,	/* destination address */
 	FRA_SRC,	/* source address */
-	FRA_IFNAME,	/* interface name */
+	FRA_IIFNAME,	/* interface name */
+#define FRA_IFNAME	FRA_IIFNAME
 	FRA_GOTO,	/* target to jump to (FR_ACT_GOTO) */
 	FRA_UNUSED2,
 	FRA_PRIORITY,	/* priority/preference */
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 22fb323cd85e..62bebcb2a51c 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -10,7 +10,7 @@
 struct fib_rule {
 	struct list_head	list;
 	atomic_t		refcnt;
-	int			ifindex;
+	int			iifindex;
 	u32			mark;
 	u32			mark_mask;
 	u32			pref;
@@ -19,7 +19,7 @@ struct fib_rule {
 	u8			action;
 	u32			target;
 	struct fib_rule *	ctarget;
-	char			ifname[IFNAMSIZ];
+	char			iifname[IFNAMSIZ];
 	struct rcu_head		rcu;
 	struct net *		fr_net;
 };
@@ -67,7 +67,7 @@ struct fib_rules_ops {
 };
 
 #define FRA_GENERIC_POLICY \
-	[FRA_IFNAME]	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \
+	[FRA_IIFNAME]	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \
 	[FRA_PRIORITY]	= { .type = NLA_U32 }, \
 	[FRA_FWMARK]	= { .type = NLA_U32 }, \
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index bd309384f8b8..8e8028cdc87f 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -135,7 +135,7 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
 {
 	int ret = 0;
 
-	if (rule->ifindex && (rule->ifindex != fl->iif))
+	if (rule->iifindex && (rule->iifindex != fl->iif))
 		goto out;
 
 	if ((rule->mark ^ fl->mark) & rule->mark_mask)
@@ -248,14 +248,14 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (tb[FRA_PRIORITY])
 		rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
 
-	if (tb[FRA_IFNAME]) {
+	if (tb[FRA_IIFNAME]) {
 		struct net_device *dev;
 
-		rule->ifindex = -1;
-		nla_strlcpy(rule->ifname, tb[FRA_IFNAME], IFNAMSIZ);
-		dev = __dev_get_by_name(net, rule->ifname);
+		rule->iifindex = -1;
+		nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, rule->iifname);
 		if (dev)
-			rule->ifindex = dev->ifindex;
+			rule->iifindex = dev->ifindex;
 	}
 
 	if (tb[FRA_FWMARK]) {
@@ -388,8 +388,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		    (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
 			continue;
 
-		if (tb[FRA_IFNAME] &&
-		    nla_strcmp(tb[FRA_IFNAME], rule->ifname))
+		if (tb[FRA_IIFNAME] &&
+		    nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
 			continue;
 
 		if (tb[FRA_FWMARK] &&
@@ -447,7 +447,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 					 struct fib_rule *rule)
 {
 	size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
-			 + nla_total_size(IFNAMSIZ) /* FRA_IFNAME */
+			 + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
 			 + nla_total_size(4) /* FRA_FWMARK */
@@ -481,11 +481,11 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL)
 		frh->flags |= FIB_RULE_UNRESOLVED;
 
-	if (rule->ifname[0]) {
-		NLA_PUT_STRING(skb, FRA_IFNAME, rule->ifname);
+	if (rule->iifname[0]) {
+		NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname);
 
-		if (rule->ifindex == -1)
-			frh->flags |= FIB_RULE_DEV_DETACHED;
+		if (rule->iifindex == -1)
+			frh->flags |= FIB_RULE_IIF_DETACHED;
 	}
 
 	if (rule->pref)
@@ -600,9 +600,9 @@ static void attach_rules(struct list_head *rules, struct net_device *dev)
 	struct fib_rule *rule;
 
 	list_for_each_entry(rule, rules, list) {
-		if (rule->ifindex == -1 &&
-		    strcmp(dev->name, rule->ifname) == 0)
-			rule->ifindex = dev->ifindex;
+		if (rule->iifindex == -1 &&
+		    strcmp(dev->name, rule->iifname) == 0)
+			rule->iifindex = dev->ifindex;
 	}
 }
 
@@ -611,8 +611,8 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
 	struct fib_rule *rule;
 
 	list_for_each_entry(rule, rules, list)
-		if (rule->ifindex == dev->ifindex)
-			rule->ifindex = -1;
+		if (rule->iifindex == dev->ifindex)
+			rule->iifindex = -1;
 }
 
 
-- 
cgit v1.2.3


From 1b038a5e60c7812f19818e8a5df96d029e49c38f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 3 Dec 2009 01:25:56 +0000
Subject: net 03/05: fib_rules: add oif classification

commit 68144d350f4f6c348659c825cde6a82b34c27a91
Author: Patrick McHardy <kaber@trash.net>
Date:   Thu Dec 3 12:05:25 2009 +0100

    net: fib_rules: add oif classification

    Support routing table lookup based on the flow's oif. This is useful to
    classify packets originating from sockets bound to interfaces differently.

    The route cache already includes the oif and needs no changes.

    Signed-off-by: Patrick McHardy <kaber@trash.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/fib_rules.h |  2 ++
 include/net/fib_rules.h   |  3 +++
 net/core/fib_rules.c      | 33 ++++++++++++++++++++++++++++++++-
 3 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h
index 7e11bb2fa655..51da65b68b85 100644
--- a/include/linux/fib_rules.h
+++ b/include/linux/fib_rules.h
@@ -10,6 +10,7 @@
 #define FIB_RULE_UNRESOLVED	0x00000004
 #define FIB_RULE_IIF_DETACHED	0x00000008
 #define FIB_RULE_DEV_DETACHED	FIB_RULE_IIF_DETACHED
+#define FIB_RULE_OIF_DETACHED	0x00000010
 
 /* try to find source address in routing lookups */
 #define FIB_RULE_FIND_SADDR	0x00010000
@@ -47,6 +48,7 @@ enum {
 	FRA_UNUSED8,
 	FRA_TABLE,	/* Extended table id */
 	FRA_FWMASK,	/* mask for netfilter mark */
+	FRA_OIFNAME,
 	__FRA_MAX
 };
 
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 62bebcb2a51c..d4e875a58f8b 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -11,6 +11,7 @@ struct fib_rule {
 	struct list_head	list;
 	atomic_t		refcnt;
 	int			iifindex;
+	int			oifindex;
 	u32			mark;
 	u32			mark_mask;
 	u32			pref;
@@ -20,6 +21,7 @@ struct fib_rule {
 	u32			target;
 	struct fib_rule *	ctarget;
 	char			iifname[IFNAMSIZ];
+	char			oifname[IFNAMSIZ];
 	struct rcu_head		rcu;
 	struct net *		fr_net;
 };
@@ -68,6 +70,7 @@ struct fib_rules_ops {
 
 #define FRA_GENERIC_POLICY \
 	[FRA_IIFNAME]	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \
+	[FRA_OIFNAME]	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \
 	[FRA_PRIORITY]	= { .type = NLA_U32 }, \
 	[FRA_FWMARK]	= { .type = NLA_U32 }, \
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 8e8028cdc87f..d1a70ad4b544 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -138,6 +138,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
 	if (rule->iifindex && (rule->iifindex != fl->iif))
 		goto out;
 
+	if (rule->oifindex && (rule->oifindex != fl->oif))
+		goto out;
+
 	if ((rule->mark ^ fl->mark) & rule->mark_mask)
 		goto out;
 
@@ -258,6 +261,16 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 			rule->iifindex = dev->ifindex;
 	}
 
+	if (tb[FRA_OIFNAME]) {
+		struct net_device *dev;
+
+		rule->oifindex = -1;
+		nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, rule->oifname);
+		if (dev)
+			rule->oifindex = dev->ifindex;
+	}
+
 	if (tb[FRA_FWMARK]) {
 		rule->mark = nla_get_u32(tb[FRA_FWMARK]);
 		if (rule->mark)
@@ -392,6 +405,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		    nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
 			continue;
 
+		if (tb[FRA_OIFNAME] &&
+		    nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
+			continue;
+
 		if (tb[FRA_FWMARK] &&
 		    (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
 			continue;
@@ -448,6 +465,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 {
 	size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
 			 + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
+			 + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
 			 + nla_total_size(4) /* FRA_FWMARK */
@@ -488,6 +506,13 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 			frh->flags |= FIB_RULE_IIF_DETACHED;
 	}
 
+	if (rule->oifname[0]) {
+		NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname);
+
+		if (rule->oifindex == -1)
+			frh->flags |= FIB_RULE_OIF_DETACHED;
+	}
+
 	if (rule->pref)
 		NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref);
 
@@ -603,6 +628,9 @@ static void attach_rules(struct list_head *rules, struct net_device *dev)
 		if (rule->iifindex == -1 &&
 		    strcmp(dev->name, rule->iifname) == 0)
 			rule->iifindex = dev->ifindex;
+		if (rule->oifindex == -1 &&
+		    strcmp(dev->name, rule->oifname) == 0)
+			rule->oifindex = dev->ifindex;
 	}
 }
 
@@ -610,9 +638,12 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
 {
 	struct fib_rule *rule;
 
-	list_for_each_entry(rule, rules, list)
+	list_for_each_entry(rule, rules, list) {
 		if (rule->iifindex == dev->ifindex)
 			rule->iifindex = -1;
+		if (rule->oifindex == dev->ifindex)
+			rule->oifindex = -1;
+	}
 }
 
 
-- 
cgit v1.2.3


From 8153a10c08f1312af563bb92532002e46d3f504a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 3 Dec 2009 01:25:58 +0000
Subject: ipv4 05/05: add sysctl to accept packets with local source addresses

commit 8ec1e0ebe26087bfc5c0394ada5feb5758014fc8
Author: Patrick McHardy <kaber@trash.net>
Date:   Thu Dec 3 12:16:35 2009 +0100

    ipv4: add sysctl to accept packets with local source addresses

    Change fib_validate_source() to accept packets with a local source address when
    the "accept_local" sysctl is set for the incoming inet device. Combined with the
    previous patches, this allows to communicate between multiple local interfaces
    over the wire.

    Signed-off-by: Patrick McHardy <kaber@trash.net>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/inetdevice.h             |  1 +
 include/linux/sysctl.h                 |  1 +
 kernel/sysctl_check.c                  |  1 +
 net/ipv4/devinet.c                     |  1 +
 net/ipv4/fib_frontend.c                | 11 +++++++----
 6 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 989f5538b8dd..006b39dec87d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -731,6 +731,12 @@ accept_source_route - BOOLEAN
 	default TRUE (router)
 		FALSE (host)
 
+accept_local - BOOLEAN
+	Accept packets with local source addresses. In combination with
+	suitable routing, this can be used to direct packets between two
+	local interfaces over the wire and have them accepted properly.
+	default FALSE
+
 rp_filter - INTEGER
 	0 - No source validation.
 	1 - Strict mode as defined in RFC3704 Strict Reverse Path
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index eecfa559bfb4..699e85c01a4d 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -83,6 +83,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 #define IN_DEV_RPFILTER(in_dev)		IN_DEV_MAXCONF((in_dev), RP_FILTER)
 #define IN_DEV_SOURCE_ROUTE(in_dev)	IN_DEV_ANDCONF((in_dev), \
 						       ACCEPT_SOURCE_ROUTE)
+#define IN_DEV_ACCEPT_LOCAL(in_dev)	IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL)
 #define IN_DEV_BOOTP_RELAY(in_dev)	IN_DEV_ANDCONF((in_dev), BOOTP_RELAY)
 
 #define IN_DEV_LOG_MARTIANS(in_dev)	IN_DEV_ORCONF((in_dev), LOG_MARTIANS)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1e4743ee6831..9f047d73a216 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -490,6 +490,7 @@ enum
 	NET_IPV4_CONF_PROMOTE_SECONDARIES=20,
 	NET_IPV4_CONF_ARP_ACCEPT=21,
 	NET_IPV4_CONF_ARP_NOTIFY=22,
+	NET_IPV4_CONF_ACCEPT_LOCAL=23,
 	__NET_IPV4_CONF_MAX
 };
 
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b6e7aaea4604..f1d676e4b368 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -220,6 +220,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
 	{ NET_IPV4_CONF_PROMOTE_SECONDARIES,	"promote_secondaries" },
 	{ NET_IPV4_CONF_ARP_ACCEPT,		"arp_accept" },
 	{ NET_IPV4_CONF_ARP_NOTIFY,		"arp_notify" },
+	{ NET_IPV4_CONF_ACCEPT_LOCAL,		"accept_local" },
 	{}
 };
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c100709d6ddf..e3126612fcbb 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1468,6 +1468,7 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
 		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
 					"accept_source_route"),
+		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
 		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
 		DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
 		DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3b373a8b0473..3323168ee52d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -241,16 +241,17 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			    .iif = oif };
 
 	struct fib_result res;
-	int no_addr, rpf;
+	int no_addr, rpf, accept_local;
 	int ret;
 	struct net *net;
 
-	no_addr = rpf = 0;
+	no_addr = rpf = accept_local = 0;
 	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
 		no_addr = in_dev->ifa_list == NULL;
 		rpf = IN_DEV_RPFILTER(in_dev);
+		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
 	}
 	rcu_read_unlock();
 
@@ -260,8 +261,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 	net = dev_net(dev);
 	if (fib_lookup(net, &fl, &res))
 		goto last_resort;
-	if (res.type != RTN_UNICAST)
-		goto e_inval_res;
+	if (res.type != RTN_UNICAST) {
+		if (res.type != RTN_LOCAL || !accept_local)
+			goto e_inval_res;
+	}
 	*spec_dst = FIB_RES_PREFSRC(res);
 	fib_combine_itag(itag, &res);
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-- 
cgit v1.2.3


From 141518c95870228da4e050fbe31a8f0c9df82c72 Mon Sep 17 00:00:00 2001
From: Matt Carlson <mcarlson@broadcom.com>
Date: Thu, 3 Dec 2009 08:36:22 +0000
Subject: tg3: Add some VPD preprocessor constants

This patch cleans up the VPD code by creating preprocessor definitions
and using them in the place of hardcoded constants.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c       | 15 ++++++++-------
 drivers/net/tg3.h       | 10 ++++++++--
 include/linux/ethtool.h |  3 ++-
 3 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 36f2e1b8cbb1..2576effca845 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -12433,7 +12433,7 @@ skip_phy_reset:
 
 static void __devinit tg3_read_partno(struct tg3 *tp)
 {
-	unsigned char vpd_data[256];   /* in little-endian format */
+	unsigned char vpd_data[TG3_NVM_VPD_LEN];   /* in little-endian format */
 	unsigned int i;
 	u32 magic;
 
@@ -12442,14 +12442,14 @@ static void __devinit tg3_read_partno(struct tg3 *tp)
 		goto out_not_found;
 
 	if (magic == TG3_EEPROM_MAGIC) {
-		for (i = 0; i < 256; i += 4) {
+		for (i = 0; i < TG3_NVM_VPD_LEN; i += 4) {
 			u32 tmp;
 
 			/* The data is in little-endian format in NVRAM.
 			 * Use the big-endian read routines to preserve
 			 * the byte order as it exists in NVRAM.
 			 */
-			if (tg3_nvram_read_be32(tp, 0x100 + i, &tmp))
+			if (tg3_nvram_read_be32(tp, TG3_NVM_VPD_OFF + i, &tmp))
 				goto out_not_found;
 
 			memcpy(&vpd_data[i], &tmp, sizeof(tmp));
@@ -12458,7 +12458,7 @@ static void __devinit tg3_read_partno(struct tg3 *tp)
 		int vpd_cap;
 
 		vpd_cap = pci_find_capability(tp->pdev, PCI_CAP_ID_VPD);
-		for (i = 0; i < 256; i += 4) {
+		for (i = 0; i < TG3_NVM_VPD_LEN; i += 4) {
 			u32 tmp, j = 0;
 			__le32 v;
 			u16 tmp16;
@@ -12483,7 +12483,7 @@ static void __devinit tg3_read_partno(struct tg3 *tp)
 	}
 
 	/* Now parse and find the part number. */
-	for (i = 0; i < 254; ) {
+	for (i = 0; i < TG3_NVM_VPD_LEN - 2; ) {
 		unsigned char val = vpd_data[i];
 		unsigned int block_end;
 
@@ -12502,7 +12502,7 @@ static void __devinit tg3_read_partno(struct tg3 *tp)
 			      (vpd_data[i + 2] << 8)));
 		i += 3;
 
-		if (block_end > 256)
+		if (block_end > TG3_NVM_VPD_LEN)
 			goto out_not_found;
 
 		while (i < (block_end - 2)) {
@@ -12511,7 +12511,8 @@ static void __devinit tg3_read_partno(struct tg3 *tp)
 				int partno_len = vpd_data[i + 2];
 
 				i += 3;
-				if (partno_len > 24 || (partno_len + i) > 256)
+				if (partno_len > TG3_BPN_SIZE ||
+				    (partno_len + i) > TG3_NVM_VPD_LEN)
 					goto out_not_found;
 
 				memcpy(tp->board_part_number,
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 6a2c31071caf..cd30889650f8 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -1821,6 +1821,11 @@
 
 #define TG3_OTP_DEFAULT			0x286c1640
 
+
+/* Hardware Legacy NVRAM layout */
+#define TG3_NVM_VPD_OFF			0x100
+#define TG3_NVM_VPD_LEN			256
+
 /* Hardware Selfboot NVRAM layout */
 #define TG3_NVM_HWSB_CFG1		0x00000004
 #define  TG3_NVM_HWSB_CFG1_MAJMSK	0xf8000000
@@ -2893,8 +2898,9 @@ struct tg3 {
 	u32				led_ctrl;
 	u32				phy_otp;
 
-	char				board_part_number[24];
-#define TG3_VER_SIZE 32
+#define TG3_BPN_SIZE			24
+	char				board_part_number[TG3_BPN_SIZE];
+#define TG3_VER_SIZE			ETHTOOL_FWVERS_LEN
 	char				fw_ver[TG3_VER_SIZE];
 	u32				nic_sram_data_cfg;
 	u32				pci_clock_ctrl;
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index bcaa0e0a54d3..ef4a2d84d922 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -49,13 +49,14 @@ static inline __u32 ethtool_cmd_speed(struct ethtool_cmd *ep)
 	return (ep->speed_hi << 16) | ep->speed;
 }
 
+#define ETHTOOL_FWVERS_LEN	32
 #define ETHTOOL_BUSINFO_LEN	32
 /* these strings are set to whatever the driver author decides... */
 struct ethtool_drvinfo {
 	__u32	cmd;
 	char	driver[32];	/* driver short name, "tulip", "eepro100" */
 	char	version[32];	/* driver version string */
-	char	fw_version[32];	/* firmware version string, if applicable */
+	char	fw_version[ETHTOOL_FWVERS_LEN];	/* firmware version string */
 	char	bus_info[ETHTOOL_BUSINFO_LEN];	/* Bus info for this IF. */
 				/* For PCI devices, use pci_name(pci_dev). */
 	char	reserved1[32];
-- 
cgit v1.2.3


From e78db4dfb1355a895f7ea50133b702b55b8ed184 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Thu, 26 Nov 2009 22:46:03 -0500
Subject: libata: Report zeroed read after TRIM and max discard size

Our current TRIM payload is a single sector that can accommodate 64 *
65535 blocks being unmapped.  Report this value in the Block Limits
Maximum Unmap LBA count field.

If a storage device supports TRIM and the DRAT and RZAT bits are set,
report TPRZ=1 in Read Capacity(16).

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-scsi.c | 12 +++++++++---
 include/linux/ata.h       | 11 +++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 340a616d226b..e1e186b9dfcc 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -2115,8 +2115,10 @@ static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf)
 	 * that we support some form of unmap - in thise case via WRITE SAME
 	 * with the unmap bit set.
 	 */
-	if (ata_id_has_trim(args->id))
+	if (ata_id_has_trim(args->id)) {
+		put_unaligned_be32(65535 * 512 / 8, &rbuf[20]);
 		put_unaligned_be32(1, &rbuf[28]);
+	}
 
 	return 0;
 }
@@ -2411,8 +2413,12 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
 		rbuf[14] = (lowest_aligned >> 8) & 0x3f;
 		rbuf[15] = lowest_aligned;
 
-		if (ata_id_has_trim(args->id))
-			rbuf[14] |= 0x80;
+		if (ata_id_has_trim(args->id)) {
+			rbuf[14] |= 0x80; /* TPE */
+
+			if (ata_id_has_zero_after_trim(args->id))
+				rbuf[14] |= 0x40; /* TPRZ */
+		}
 	}
 
 	return 0;
diff --git a/include/linux/ata.h b/include/linux/ata.h
index e2595e877e44..dfa2298bf488 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -75,6 +75,7 @@ enum {
 	ATA_ID_EIDE_DMA_TIME	= 66,
 	ATA_ID_EIDE_PIO		= 67,
 	ATA_ID_EIDE_PIO_IORDY	= 68,
+	ATA_ID_ADDITIONAL_SUPP	= 69,
 	ATA_ID_QUEUE_DEPTH	= 75,
 	ATA_ID_MAJOR_VER	= 80,
 	ATA_ID_COMMAND_SET_1	= 82,
@@ -816,6 +817,16 @@ static inline int ata_id_has_trim(const u16 *id)
 	return 0;
 }
 
+static inline int ata_id_has_zero_after_trim(const u16 *id)
+{
+	/* DSM supported, deterministic read, and read zero after trim set */
+	if (ata_id_has_trim(id) &&
+	    (id[ATA_ID_ADDITIONAL_SUPP] & 0x4020) == 0x4020)
+		return 1;
+
+	return 0;
+}
+
 static inline int ata_id_current_chs_valid(const u16 *id)
 {
 	/* For ATA-1 devices, if the INITIALIZE DEVICE PARAMETERS command
-- 
cgit v1.2.3


From d0634c4aea0b80447cbdc4c0db285004b860c455 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Thu, 26 Nov 2009 12:00:43 -0500
Subject: libata: Clarify ata_set_lba_range_entries function

ata_set_lba_range_entries used the variable max for two different things
which was confusing.  Make the function take a buffer size in bytes as
argument and return the used buffer size upon completion.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-scsi.c |  2 +-
 include/linux/ata.h       | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index e1e186b9dfcc..62e6b9ea96af 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -2972,7 +2972,7 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
 		goto invalid_fld;
 
 	buf = page_address(sg_page(scsi_sglist(scmd)));
-	size = ata_set_lba_range_entries(buf, 512 / 8, block, n_block);
+	size = ata_set_lba_range_entries(buf, 512, block, n_block);
 
 	tf->protocol = ATA_PROT_DMA;
 	tf->hob_feature = 0;
diff --git a/include/linux/ata.h b/include/linux/ata.h
index dfa2298bf488..38a6948ce0c2 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -982,17 +982,17 @@ static inline void ata_id_to_hd_driveid(u16 *id)
 }
 
 /*
- * Write up to 'max' LBA Range Entries to the buffer that will cover the
- * extent from sector to sector + count.  This is used for TRIM and for
- * ADD LBA(S) TO NV CACHE PINNED SET.
+ * Write LBA Range Entries to the buffer that will cover the extent from
+ * sector to sector + count.  This is used for TRIM and for ADD LBA(S)
+ * TO NV CACHE PINNED SET.
  */
-static inline unsigned ata_set_lba_range_entries(void *_buffer, unsigned max,
-						u64 sector, unsigned long count)
+static inline unsigned ata_set_lba_range_entries(void *_buffer,
+		unsigned buf_size, u64 sector, unsigned long count)
 {
 	__le64 *buffer = _buffer;
-	unsigned i = 0;
+	unsigned i = 0, used_bytes;
 
-	while (i < max) {
+	while (i < buf_size / 8 ) { /* 6-byte LBA + 2-byte range per entry */
 		u64 entry = sector |
 			((u64)(count > 0xffff ? 0xffff : count) << 48);
 		buffer[i++] = __cpu_to_le64(entry);
@@ -1002,9 +1002,9 @@ static inline unsigned ata_set_lba_range_entries(void *_buffer, unsigned max,
 		sector += 0xffff;
 	}
 
-	max = ALIGN(i * 8, 512);
-	memset(buffer + i, 0, max - i * 8);
-	return max;
+	used_bytes = ALIGN(i * 8, 512);
+	memset(buffer + i, 0, used_bytes - i * 8);
+	return used_bytes;
 }
 
 static inline int is_multi_taskfile(struct ata_taskfile *tf)
-- 
cgit v1.2.3


From 69ee472f2706371ca639de49b06df91615c07d8d Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oliver@neukum.org>
Date: Thu, 3 Dec 2009 15:31:18 -0800
Subject: usbnet & cdc-ether: Autosuspend for online devices

Using remote wakeup and delayed transmission to allow
online device to go into usb autosuspend.
Minimal alternate support for devices that don't support
remote wakeup.

Signed-off-by: Oliver Neukum <oliver@neukum.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_ether.c |   8 +++
 drivers/net/usb/usbnet.c    | 157 ++++++++++++++++++++++++++++++++++++--------
 include/linux/usb/usbnet.h  |   6 ++
 3 files changed, 142 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 197912d9c04a..21e183a83b99 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -411,6 +411,12 @@ static int cdc_bind(struct usbnet *dev, struct usb_interface *intf)
 	return 0;
 }
 
+static int cdc_manage_power(struct usbnet *dev, int on)
+{
+	dev->intf->needs_remote_wakeup = on;
+	return 0;
+}
+
 static const struct driver_info	cdc_info = {
 	.description =	"CDC Ethernet Device",
 	.flags =	FLAG_ETHER | FLAG_LINK_INTR,
@@ -418,6 +424,7 @@ static const struct driver_info	cdc_info = {
 	.bind =		cdc_bind,
 	.unbind =	usbnet_cdc_unbind,
 	.status =	cdc_status,
+	.manage_power =	cdc_manage_power,
 };
 
 static const struct driver_info mbm_info = {
@@ -619,6 +626,7 @@ static struct usb_driver cdc_driver = {
 	.suspend =	usbnet_suspend,
 	.resume =	usbnet_resume,
 	.reset_resume =	usbnet_resume,
+	.supports_autosuspend = 1,
 };
 
 
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 511e7bdc4573..035fab04c0a0 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -353,7 +353,8 @@ static void rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags)
 
 	if (netif_running (dev->net) &&
 	    netif_device_present (dev->net) &&
-	    !test_bit (EVENT_RX_HALT, &dev->flags)) {
+	    !test_bit (EVENT_RX_HALT, &dev->flags) &&
+	    !test_bit (EVENT_DEV_ASLEEP, &dev->flags)) {
 		switch (retval = usb_submit_urb (urb, GFP_ATOMIC)) {
 		case -EPIPE:
 			usbnet_defer_kevent (dev, EVENT_RX_HALT);
@@ -611,15 +612,39 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs);
 /*-------------------------------------------------------------------------*/
 
 // precondition: never called in_interrupt
+static void usbnet_terminate_urbs(struct usbnet *dev)
+{
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(unlink_wakeup);
+	DECLARE_WAITQUEUE(wait, current);
+	int temp;
+
+	/* ensure there are no more active urbs */
+	add_wait_queue(&unlink_wakeup, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	dev->wait = &unlink_wakeup;
+	temp = unlink_urbs(dev, &dev->txq) +
+		unlink_urbs(dev, &dev->rxq);
+
+	/* maybe wait for deletions to finish. */
+	while (!skb_queue_empty(&dev->rxq)
+		&& !skb_queue_empty(&dev->txq)
+		&& !skb_queue_empty(&dev->done)) {
+			schedule_timeout(UNLINK_TIMEOUT_MS);
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (netif_msg_ifdown(dev))
+				devdbg(dev, "waited for %d urb completions",
+					temp);
+	}
+	set_current_state(TASK_RUNNING);
+	dev->wait = NULL;
+	remove_wait_queue(&unlink_wakeup, &wait);
+}
 
 int usbnet_stop (struct net_device *net)
 {
 	struct usbnet		*dev = netdev_priv(net);
 	struct driver_info	*info = dev->driver_info;
-	int			temp;
 	int			retval;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK (unlink_wakeup);
-	DECLARE_WAITQUEUE (wait, current);
 
 	netif_stop_queue (net);
 
@@ -641,25 +666,8 @@ int usbnet_stop (struct net_device *net)
 				info->description);
 	}
 
-	if (!(info->flags & FLAG_AVOID_UNLINK_URBS)) {
-		/* ensure there are no more active urbs */
-		add_wait_queue(&unlink_wakeup, &wait);
-		dev->wait = &unlink_wakeup;
-		temp = unlink_urbs(dev, &dev->txq) +
-			unlink_urbs(dev, &dev->rxq);
-
-		/* maybe wait for deletions to finish. */
-		while (!skb_queue_empty(&dev->rxq) &&
-		       !skb_queue_empty(&dev->txq) &&
-		       !skb_queue_empty(&dev->done)) {
-			msleep(UNLINK_TIMEOUT_MS);
-			if (netif_msg_ifdown(dev))
-				devdbg(dev, "waited for %d urb completions",
-					temp);
-		}
-		dev->wait = NULL;
-		remove_wait_queue(&unlink_wakeup, &wait);
-	}
+	if (!(info->flags & FLAG_AVOID_UNLINK_URBS))
+		usbnet_terminate_urbs(dev);
 
 	usb_kill_urb(dev->interrupt);
 
@@ -672,7 +680,10 @@ int usbnet_stop (struct net_device *net)
 	dev->flags = 0;
 	del_timer_sync (&dev->delay);
 	tasklet_kill (&dev->bh);
-	usb_autopm_put_interface(dev->intf);
+	if (info->manage_power)
+		info->manage_power(dev, 0);
+	else
+		usb_autopm_put_interface(dev->intf);
 
 	return 0;
 }
@@ -753,6 +764,12 @@ int usbnet_open (struct net_device *net)
 
 	// delay posting reads until we're fully open
 	tasklet_schedule (&dev->bh);
+	if (info->manage_power) {
+		retval = info->manage_power(dev, 1);
+		if (retval < 0)
+			goto done;
+		usb_autopm_put_interface(dev->intf);
+	}
 	return retval;
 done:
 	usb_autopm_put_interface(dev->intf);
@@ -881,11 +898,16 @@ kevent (struct work_struct *work)
 	/* usb_clear_halt() needs a thread context */
 	if (test_bit (EVENT_TX_HALT, &dev->flags)) {
 		unlink_urbs (dev, &dev->txq);
+		status = usb_autopm_get_interface(dev->intf);
+		if (status < 0)
+			goto fail_pipe;
 		status = usb_clear_halt (dev->udev, dev->out);
+		usb_autopm_put_interface(dev->intf);
 		if (status < 0 &&
 		    status != -EPIPE &&
 		    status != -ESHUTDOWN) {
 			if (netif_msg_tx_err (dev))
+fail_pipe:
 				deverr (dev, "can't clear tx halt, status %d",
 					status);
 		} else {
@@ -896,11 +918,16 @@ kevent (struct work_struct *work)
 	}
 	if (test_bit (EVENT_RX_HALT, &dev->flags)) {
 		unlink_urbs (dev, &dev->rxq);
+		status = usb_autopm_get_interface(dev->intf);
+		if (status < 0)
+			goto fail_halt;
 		status = usb_clear_halt (dev->udev, dev->in);
+		usb_autopm_put_interface(dev->intf);
 		if (status < 0 &&
 		    status != -EPIPE &&
 		    status != -ESHUTDOWN) {
 			if (netif_msg_rx_err (dev))
+fail_halt:
 				deverr (dev, "can't clear rx halt, status %d",
 					status);
 		} else {
@@ -919,7 +946,12 @@ kevent (struct work_struct *work)
 			clear_bit (EVENT_RX_MEMORY, &dev->flags);
 		if (urb != NULL) {
 			clear_bit (EVENT_RX_MEMORY, &dev->flags);
+			status = usb_autopm_get_interface(dev->intf);
+			if (status < 0)
+				goto fail_lowmem;
 			rx_submit (dev, urb, GFP_KERNEL);
+			usb_autopm_put_interface(dev->intf);
+fail_lowmem:
 			tasklet_schedule (&dev->bh);
 		}
 	}
@@ -929,11 +961,18 @@ kevent (struct work_struct *work)
 		int			retval = 0;
 
 		clear_bit (EVENT_LINK_RESET, &dev->flags);
+		status = usb_autopm_get_interface(dev->intf);
+		if (status < 0)
+			goto skip_reset;
 		if(info->link_reset && (retval = info->link_reset(dev)) < 0) {
+			usb_autopm_put_interface(dev->intf);
+skip_reset:
 			devinfo(dev, "link reset failed (%d) usbnet usb-%s-%s, %s",
 				retval,
 				dev->udev->bus->bus_name, dev->udev->devpath,
 				info->description);
+		} else {
+			usb_autopm_put_interface(dev->intf);
 		}
 	}
 
@@ -971,6 +1010,7 @@ static void tx_complete (struct urb *urb)
 		case -EPROTO:
 		case -ETIME:
 		case -EILSEQ:
+			usb_mark_last_busy(dev->udev);
 			if (!timer_pending (&dev->delay)) {
 				mod_timer (&dev->delay,
 					jiffies + THROTTLE_JIFFIES);
@@ -987,6 +1027,7 @@ static void tx_complete (struct urb *urb)
 		}
 	}
 
+	usb_autopm_put_interface_async(dev->intf);
 	urb->dev = NULL;
 	entry->state = tx_done;
 	defer_bh(dev, skb, &dev->txq);
@@ -1057,14 +1098,34 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb,
 		}
 	}
 
-	spin_lock_irqsave (&dev->txq.lock, flags);
+	spin_lock_irqsave(&dev->txq.lock, flags);
+	retval = usb_autopm_get_interface_async(dev->intf);
+	if (retval < 0) {
+		spin_unlock_irqrestore(&dev->txq.lock, flags);
+		goto drop;
+	}
+
+#ifdef CONFIG_PM
+	/* if this triggers the device is still a sleep */
+	if (test_bit(EVENT_DEV_ASLEEP, &dev->flags)) {
+		/* transmission will be done in resume */
+		usb_anchor_urb(urb, &dev->deferred);
+		/* no use to process more packets */
+		netif_stop_queue(net);
+		spin_unlock_irqrestore(&dev->txq.lock, flags);
+		devdbg(dev, "Delaying transmission for resumption");
+		goto deferred;
+	}
+#endif
 
 	switch ((retval = usb_submit_urb (urb, GFP_ATOMIC))) {
 	case -EPIPE:
 		netif_stop_queue (net);
 		usbnet_defer_kevent (dev, EVENT_TX_HALT);
+		usb_autopm_put_interface_async(dev->intf);
 		break;
 	default:
+		usb_autopm_put_interface_async(dev->intf);
 		if (netif_msg_tx_err (dev))
 			devdbg (dev, "tx: submit urb err %d", retval);
 		break;
@@ -1088,6 +1149,9 @@ drop:
 		devdbg (dev, "> tx, len %d, type 0x%x",
 			length, skb->protocol);
 	}
+#ifdef CONFIG_PM
+deferred:
+#endif
 	return NETDEV_TX_OK;
 }
 EXPORT_SYMBOL_GPL(usbnet_start_xmit);
@@ -1263,6 +1327,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 	dev->bh.func = usbnet_bh;
 	dev->bh.data = (unsigned long) dev;
 	INIT_WORK (&dev->kevent, kevent);
+	init_usb_anchor(&dev->deferred);
 	dev->delay.function = usbnet_bh;
 	dev->delay.data = (unsigned long) dev;
 	init_timer (&dev->delay);
@@ -1382,13 +1447,23 @@ int usbnet_suspend (struct usb_interface *intf, pm_message_t message)
 	struct usbnet		*dev = usb_get_intfdata(intf);
 
 	if (!dev->suspend_count++) {
+		spin_lock_irq(&dev->txq.lock);
+		/* don't autosuspend while transmitting */
+		if (dev->txq.qlen && (message.event & PM_EVENT_AUTO)) {
+			spin_unlock_irq(&dev->txq.lock);
+			return -EBUSY;
+		} else {
+			set_bit(EVENT_DEV_ASLEEP, &dev->flags);
+			spin_unlock_irq(&dev->txq.lock);
+		}
 		/*
 		 * accelerate emptying of the rx and queues, to avoid
 		 * having everything error out.
 		 */
 		netif_device_detach (dev->net);
-		(void) unlink_urbs (dev, &dev->rxq);
-		(void) unlink_urbs (dev, &dev->txq);
+		usbnet_terminate_urbs(dev);
+		usb_kill_urb(dev->interrupt);
+
 		/*
 		 * reattach so runtime management can use and
 		 * wake the device
@@ -1402,10 +1477,34 @@ EXPORT_SYMBOL_GPL(usbnet_suspend);
 int usbnet_resume (struct usb_interface *intf)
 {
 	struct usbnet		*dev = usb_get_intfdata(intf);
+	struct sk_buff          *skb;
+	struct urb              *res;
+	int                     retval;
+
+	if (!--dev->suspend_count) {
+		spin_lock_irq(&dev->txq.lock);
+		while ((res = usb_get_from_anchor(&dev->deferred))) {
+
+			printk(KERN_INFO"%s has delayed data\n", __func__);
+			skb = (struct sk_buff *)res->context;
+			retval = usb_submit_urb(res, GFP_ATOMIC);
+			if (retval < 0) {
+				dev_kfree_skb_any(skb);
+				usb_free_urb(res);
+				usb_autopm_put_interface_async(dev->intf);
+			} else {
+				dev->net->trans_start = jiffies;
+				__skb_queue_tail(&dev->txq, skb);
+			}
+		}
 
-	if (!--dev->suspend_count)
+		smp_mb();
+		clear_bit(EVENT_DEV_ASLEEP, &dev->flags);
+		spin_unlock_irq(&dev->txq.lock);
+		if (!(dev->txq.qlen >= TX_QLEN(dev)))
+			netif_start_queue(dev->net);
 		tasklet_schedule (&dev->bh);
-
+	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(usbnet_resume);
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 8c84881f8478..8ce61359bf73 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -55,6 +55,7 @@ struct usbnet {
 	struct sk_buff_head	done;
 	struct sk_buff_head	rxq_pause;
 	struct urb		*interrupt;
+	struct usb_anchor	deferred;
 	struct tasklet_struct	bh;
 
 	struct work_struct	kevent;
@@ -65,6 +66,8 @@ struct usbnet {
 #		define EVENT_STS_SPLIT	3
 #		define EVENT_LINK_RESET	4
 #		define EVENT_RX_PAUSED	5
+#		define EVENT_DEV_WAKING 6
+#		define EVENT_DEV_ASLEEP 7
 };
 
 static inline struct usb_driver *driver_of(struct usb_interface *intf)
@@ -109,6 +112,9 @@ struct driver_info {
 	/* see if peer is connected ... can sleep */
 	int	(*check_connect)(struct usbnet *);
 
+	/* (dis)activate runtime power management */
+	int	(*manage_power)(struct usbnet *, int);
+
 	/* for status polling */
 	void	(*status)(struct usbnet *, struct urb *);
 
-- 
cgit v1.2.3


From fc4a7489663250360cd40d5adf06a08d1c5d54df Mon Sep 17 00:00:00 2001
From: Patrick Mullaney <pmullaney@novell.com>
Date: Thu, 3 Dec 2009 15:59:22 -0800
Subject: netdevice: provide common routine for macvlan and vlan operstate
 management

Provide common routine for the transition of operational state for a leaf
device during a root device transition.

Signed-off-by: Patrick Mullaney <pmullaney@novell.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c     | 24 +++---------------------
 include/linux/netdevice.h |  3 +++
 net/8021q/vlan.c          | 29 ++++-------------------------
 net/core/dev.c            | 27 +++++++++++++++++++++++++++
 4 files changed, 37 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 93c3e6edf702..21a9c9ab4b34 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -582,25 +582,6 @@ static void macvlan_port_destroy(struct net_device *dev)
 	kfree(port);
 }
 
-static void macvlan_transfer_operstate(struct net_device *dev)
-{
-	struct macvlan_dev *vlan = netdev_priv(dev);
-	const struct net_device *lowerdev = vlan->lowerdev;
-
-	if (lowerdev->operstate == IF_OPER_DORMANT)
-		netif_dormant_on(dev);
-	else
-		netif_dormant_off(dev);
-
-	if (netif_carrier_ok(lowerdev)) {
-		if (!netif_carrier_ok(dev))
-			netif_carrier_on(dev);
-	} else {
-		if (netif_carrier_ok(dev))
-			netif_carrier_off(dev);
-	}
-}
-
 static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[])
 {
 	if (tb[IFLA_ADDRESS]) {
@@ -693,7 +674,7 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
 		return err;
 
 	list_add_tail(&vlan->list, &port->vlans);
-	macvlan_transfer_operstate(dev);
+	netif_stacked_transfer_operstate(lowerdev, dev);
 	return 0;
 }
 
@@ -768,7 +749,8 @@ static int macvlan_device_event(struct notifier_block *unused,
 	switch (event) {
 	case NETDEV_CHANGE:
 		list_for_each_entry(vlan, &port->vlans, list)
-			macvlan_transfer_operstate(vlan->dev);
+			netif_stacked_transfer_operstate(vlan->lowerdev,
+							 vlan->dev);
 		break;
 	case NETDEV_FEAT_CHANGE:
 		list_for_each_entry(vlan, &port->vlans, list) {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index daf13d367498..a3fccc85b1a0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1981,6 +1981,9 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one,
 					unsigned long mask);
 unsigned long netdev_fix_features(unsigned long features, const char *name);
 
+void netif_stacked_transfer_operstate(const struct net_device *rootdev,
+					struct net_device *dev);
+
 static inline int net_gso_ok(int features, int gso_type)
 {
 	int feature = gso_type << NETIF_F_GSO_SHIFT;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index ec3769295dac..33f90e7362cc 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -184,27 +184,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 	dev_put(real_dev);
 }
 
-static void vlan_transfer_operstate(const struct net_device *dev,
-				    struct net_device *vlandev)
-{
-	/* Have to respect userspace enforced dormant state
-	 * of real device, also must allow supplicant running
-	 * on VLAN device
-	 */
-	if (dev->operstate == IF_OPER_DORMANT)
-		netif_dormant_on(vlandev);
-	else
-		netif_dormant_off(vlandev);
-
-	if (netif_carrier_ok(dev)) {
-		if (!netif_carrier_ok(vlandev))
-			netif_carrier_on(vlandev);
-	} else {
-		if (netif_carrier_ok(vlandev))
-			netif_carrier_off(vlandev);
-	}
-}
-
 int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id)
 {
 	const char *name = real_dev->name;
@@ -262,7 +241,7 @@ int register_vlan_dev(struct net_device *dev)
 	/* Account for reference in struct vlan_dev_info */
 	dev_hold(real_dev);
 
-	vlan_transfer_operstate(real_dev, dev);
+	netif_stacked_transfer_operstate(real_dev, dev);
 	linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */
 
 	/* So, got the sucker initialized, now lets place
@@ -453,7 +432,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 			if (!vlandev)
 				continue;
 
-			vlan_transfer_operstate(dev, vlandev);
+			netif_stacked_transfer_operstate(dev, vlandev);
 		}
 		break;
 
@@ -511,7 +490,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 			vlan = vlan_dev_info(vlandev);
 			if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
 				dev_change_flags(vlandev, flgs & ~IFF_UP);
-			vlan_transfer_operstate(dev, vlandev);
+			netif_stacked_transfer_operstate(dev, vlandev);
 		}
 		break;
 
@@ -529,7 +508,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 			vlan = vlan_dev_info(vlandev);
 			if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
 				dev_change_flags(vlandev, flgs | IFF_UP);
-			vlan_transfer_operstate(dev, vlandev);
+			netif_stacked_transfer_operstate(dev, vlandev);
 		}
 		break;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 0913a08a87d6..c36a17aafcf3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4900,6 +4900,33 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
 }
 EXPORT_SYMBOL(netdev_fix_features);
 
+/**
+ *	netif_stacked_transfer_operstate -	transfer operstate
+ *	@rootdev: the root or lower level device to transfer state from
+ *	@dev: the device to transfer operstate to
+ *
+ *	Transfer operational state from root to device. This is normally
+ *	called when a stacking relationship exists between the root
+ *	device and the device(a leaf device).
+ */
+void netif_stacked_transfer_operstate(const struct net_device *rootdev,
+					struct net_device *dev)
+{
+	if (rootdev->operstate == IF_OPER_DORMANT)
+		netif_dormant_on(dev);
+	else
+		netif_dormant_off(dev);
+
+	if (netif_carrier_ok(rootdev)) {
+		if (!netif_carrier_ok(dev))
+			netif_carrier_on(dev);
+	} else {
+		if (netif_carrier_ok(dev))
+			netif_carrier_off(dev);
+	}
+}
+EXPORT_SYMBOL(netif_stacked_transfer_operstate);
+
 /**
  *	register_netdevice	- register a network device
  *	@dev: device to register
-- 
cgit v1.2.3


From b69f2292063d2caf37ca9aec7d63ded203701bf3 Mon Sep 17 00:00:00 2001
From: Louis Rilling <louis.rilling@kerlabs.com>
Date: Fri, 4 Dec 2009 14:52:42 +0100
Subject: block: Fix io_context leak after failure of clone with CLONE_IO

With CLONE_IO, parent's io_context->nr_tasks is incremented, but never
decremented whenever copy_process() fails afterwards, which prevents
exit_io_context() from calling IO schedulers exit functions.

Give a task_struct to exit_io_context(), and call exit_io_context() instead of
put_io_context() in copy_process() cleanup path.

Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-ioc.c           | 10 +++++-----
 include/linux/iocontext.h |  5 +++--
 kernel/exit.c             |  2 +-
 kernel/fork.c             |  3 ++-
 4 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index dcd041290b28..cbdabb0dd6d7 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -66,14 +66,14 @@ static void cfq_exit(struct io_context *ioc)
 }
 
 /* Called by the exitting task */
-void exit_io_context(void)
+void exit_io_context(struct task_struct *task)
 {
 	struct io_context *ioc;
 
-	task_lock(current);
-	ioc = current->io_context;
-	current->io_context = NULL;
-	task_unlock(current);
+	task_lock(task);
+	ioc = task->io_context;
+	task->io_context = NULL;
+	task_unlock(task);
 
 	if (atomic_dec_and_test(&ioc->nr_tasks)) {
 		if (ioc->aic && ioc->aic->exit)
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index d61b0b8b5cd1..a63235996309 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -98,14 +98,15 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
 	return NULL;
 }
 
+struct task_struct;
 #ifdef CONFIG_BLOCK
 int put_io_context(struct io_context *ioc);
-void exit_io_context(void);
+void exit_io_context(struct task_struct *task);
 struct io_context *get_io_context(gfp_t gfp_flags, int node);
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 #else
-static inline void exit_io_context(void)
+static inline void exit_io_context(struct task_struct *task)
 {
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac2ecc1..2544000125d9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1004,7 +1004,7 @@ NORET_TYPE void do_exit(long code)
 	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
-		exit_io_context();
+		exit_io_context(tsk);
 
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c49257c..607353425bb0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1310,7 +1310,8 @@ bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
 bad_fork_cleanup_io:
-	put_io_context(p->io_context);
+	if (p->io_context)
+		exit_io_context(p);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-- 
cgit v1.2.3


From 38938c879eb0c39edf85d5164aa0cffe2874304c Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Fri, 4 Dec 2009 17:44:50 -0800
Subject: Add support for GCC-4.5's __builtin_unreachable() to compiler.h (v2)

Starting with version 4.5, GCC has a new built-in function
__builtin_unreachable() that can be used in places like the kernel's
BUG() where inline assembly is used to transfer control flow.  This
eliminated the need for an endless loop in these places.

The patch adds a new macro 'unreachable()' that will expand to either
__builtin_unreachable() or an endless loop depending on the compiler
version.

Change from v1: Simplify unreachable() for non-GCC 4.5 case.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc4.h | 14 ++++++++++++++
 include/linux/compiler.h      |  5 +++++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index 450fa597c94d..ab3af40a53c6 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -36,4 +36,18 @@
    the kernel context */
 #define __cold			__attribute__((__cold__))
 
+
+#if __GNUC_MINOR__ >= 5
+/*
+ * Mark a position in code as unreachable.  This can be used to
+ * suppress control flow warnings after asm blocks that transfer
+ * control elsewhere.
+ *
+ * Early snapshots of gcc 4.5 don't support this and we can't detect
+ * this in the preprocessor, but we can live with this because they're
+ * unreleased.  Really, we need to have autoconf for the kernel.
+ */
+#define unreachable() __builtin_unreachable()
+#endif
+
 #endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 04fb5135b4e1..59f208926d13 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -144,6 +144,11 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 # define barrier() __memory_barrier()
 #endif
 
+/* Unreachable code */
+#ifndef unreachable
+# define unreachable() do { } while (1)
+#endif
+
 #ifndef RELOC_HIDE
 # define RELOC_HIDE(ptr, off)					\
   ({ unsigned long __ptr;					\
-- 
cgit v1.2.3


From 7a1a8eb58a2c6cd819d17332c5a2c369203635d5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 3 Dec 2009 21:19:18 +0100
Subject: PM: Add flag for devices capable of generating run-time wake-up
 events

Apparently, there are devices that can wake up the system from sleep
states and yet are incapable of generating wake-up events at run
time.  Thus, introduce a flag indicating if given device is capable
of generating run-time wake-up events.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 Documentation/power/runtime_pm.txt |  7 +++++--
 include/linux/pm.h                 |  8 +++++---
 include/linux/pm_runtime.h         | 12 ++++++++++++
 3 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index 6bb25cb24da9..4a3109b28847 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -71,9 +71,9 @@ what to do to handle the device).
     purpose).
 
 In particular, if the driver requires remote wakeup capability for proper
-functioning and device_may_wakeup() returns 'false' for the device, then
+functioning and device_run_wake() returns 'false' for the device, then
 ->runtime_suspend() should return -EBUSY.  On the other hand, if
-device_may_wakeup() returns 'true' for the device and the device is put
+device_run_wake() returns 'true' for the device and the device is put
 into a low power state during the execution of its bus type's
 ->runtime_suspend(), it is expected that remote wake-up (i.e. hardware mechanism
 allowing the device to request a change of its power state, such as PCI PME)
@@ -215,6 +215,9 @@ defined in include/linux/pm.h:
       being executed for that device and it is not practical to wait for the
       suspend to complete; means "start a resume as soon as you've suspended"
 
+  unsigned int run_wake;
+    - set if the device is capable of generating run-time wake-up events
+
   enum rpm_status runtime_status;
     - the run-time PM status of the device; this field's initial value is
       RPM_SUSPENDED, which means that each device is initially regarded by the
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 3b7e04b95bd2..0d65934246af 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -178,9 +178,10 @@ typedef struct pm_message {
  *	This need not mean that the device should be put into a low power state.
  *	For example, if the device is behind a link which is about to be turned
  *	off, the device may remain at full power.  If the device does go to low
- *	power and if device_may_wakeup(dev) is true, remote wake-up (i.e., a
- *	hardware mechanism allowing the device to request a change of its power
- *	state, such as PCI PME) should be enabled for it.
+ *	power and is capable of generating run-time wake-up events, remote
+ *	wake-up (i.e., a hardware mechanism allowing the device to request a
+ *	change of its power state via a wake-up event, such as PCI PME) should
+ *	be enabled for it.
  *
  * @runtime_resume: Put the device into the fully active state in response to a
  *	wake-up event generated by hardware or at the request of software.  If
@@ -428,6 +429,7 @@ struct dev_pm_info {
 	unsigned int		idle_notification:1;
 	unsigned int		request_pending:1;
 	unsigned int		deferred_resume:1;
+	unsigned int		run_wake:1;
 	enum rpm_request	request;
 	enum rpm_status		runtime_status;
 	int			runtime_error;
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 44087044910f..370ce0a6fe4a 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -50,6 +50,16 @@ static inline void pm_runtime_put_noidle(struct device *dev)
 	atomic_add_unless(&dev->power.usage_count, -1, 0);
 }
 
+static inline bool device_run_wake(struct device *dev)
+{
+	return dev->power.run_wake;
+}
+
+static inline void device_set_run_wake(struct device *dev, bool enable)
+{
+	dev->power.run_wake = enable;
+}
+
 #else /* !CONFIG_PM_RUNTIME */
 
 static inline int pm_runtime_idle(struct device *dev) { return -ENOSYS; }
@@ -73,6 +83,8 @@ static inline bool pm_children_suspended(struct device *dev) { return false; }
 static inline void pm_suspend_ignore_children(struct device *dev, bool en) {}
 static inline void pm_runtime_get_noresume(struct device *dev) {}
 static inline void pm_runtime_put_noidle(struct device *dev) {}
+static inline bool device_run_wake(struct device *dev) { return false; }
+static inline void device_set_run_wake(struct device *dev, bool enable) {}
 
 #endif /* !CONFIG_PM_RUNTIME */
 
-- 
cgit v1.2.3


From 194684e596af4bdaebb424166d94a8aa528edfda Mon Sep 17 00:00:00 2001
From: Mika Kuoppala <mika.kuoppala@nokia.com>
Date: Sun, 6 Dec 2009 17:06:22 +0100
Subject: i2c: Prevent priority inversion on top of bus lock

Low priority thread holding the i2c bus mutex could block higher
priority threads to access the bus resulting in unacceptable
latencies. Change the mutex type to rt_mutex preventing priority
inversion.

Tested-by: Peter Ujfalusi <peter.ujfalusi@nokia.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@nokia.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 drivers/i2c/Kconfig    |  1 +
 drivers/i2c/i2c-core.c | 12 ++++++------
 include/linux/i2c.h    |  7 +++----
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig
index d7ece131b4f4..8d8a00e5a30e 100644
--- a/drivers/i2c/Kconfig
+++ b/drivers/i2c/Kconfig
@@ -5,6 +5,7 @@
 menuconfig I2C
 	tristate "I2C support"
 	depends on HAS_IOMEM
+	select RT_MUTEXES
 	---help---
 	  I2C (pronounce: I-square-C) is a slow serial bus protocol used in
 	  many micro controller applications and developed by Philips.  SMBus,
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 296504355142..d664b4a97a31 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -584,7 +584,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
 		goto out_list;
 	}
 
-	mutex_init(&adap->bus_lock);
+	rt_mutex_init(&adap->bus_lock);
 
 	/* Set default timeout to 1 second if not already set */
 	if (adap->timeout == 0)
@@ -1092,12 +1092,12 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 #endif
 
 		if (in_atomic() || irqs_disabled()) {
-			ret = mutex_trylock(&adap->bus_lock);
+			ret = rt_mutex_trylock(&adap->bus_lock);
 			if (!ret)
 				/* I2C activity is ongoing. */
 				return -EAGAIN;
 		} else {
-			mutex_lock_nested(&adap->bus_lock, adap->level);
+			rt_mutex_lock(&adap->bus_lock);
 		}
 
 		/* Retry automatically on arbitration loss */
@@ -1109,7 +1109,7 @@ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
 			if (time_after(jiffies, orig_jiffies + adap->timeout))
 				break;
 		}
-		mutex_unlock(&adap->bus_lock);
+		rt_mutex_unlock(&adap->bus_lock);
 
 		return ret;
 	} else {
@@ -1913,7 +1913,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 	flags &= I2C_M_TEN | I2C_CLIENT_PEC;
 
 	if (adapter->algo->smbus_xfer) {
-		mutex_lock(&adapter->bus_lock);
+		rt_mutex_lock(&adapter->bus_lock);
 
 		/* Retry automatically on arbitration loss */
 		orig_jiffies = jiffies;
@@ -1927,7 +1927,7 @@ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags,
 				       orig_jiffies + adapter->timeout))
 				break;
 		}
-		mutex_unlock(&adapter->bus_lock);
+		rt_mutex_unlock(&adapter->bus_lock);
 	} else
 		res = i2c_smbus_xfer_emulated(adapter,addr,flags,read_write,
 					      command, protocol, data);
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 7b40cda57a70..52317fb5917e 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -338,8 +338,7 @@ struct i2c_adapter {
 	void *algo_data;
 
 	/* data fields that are valid for all devices	*/
-	u8 level; 			/* nesting level for lockdep */
-	struct mutex bus_lock;
+	struct rt_mutex bus_lock;
 
 	int timeout;			/* in jiffies */
 	int retries;
@@ -367,7 +366,7 @@ static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data)
  */
 static inline void i2c_lock_adapter(struct i2c_adapter *adapter)
 {
-	mutex_lock(&adapter->bus_lock);
+	rt_mutex_lock(&adapter->bus_lock);
 }
 
 /**
@@ -376,7 +375,7 @@ static inline void i2c_lock_adapter(struct i2c_adapter *adapter)
  */
 static inline void i2c_unlock_adapter(struct i2c_adapter *adapter)
 {
-	mutex_unlock(&adapter->bus_lock);
+	rt_mutex_unlock(&adapter->bus_lock);
 }
 
 /*flags for the client struct: */
-- 
cgit v1.2.3


From c7b25a9e96dc89954ae8d8f473f56fae62030f84 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sun, 6 Dec 2009 17:06:24 +0100
Subject: i2c: Drop probe, ignore and force module parameters

The legacy probe and force module parameters are obsolete now, the
same can be achieved using the new_device sysfs interface, which is
both more flexible and cheaper (it is implemented by i2c-core rather
than replicated in every driver module.)

The legacy ignore module parameters can be dropped as well. Ignoring
can be done by instantiating a "dummy" device at the problematic
address.

This is the first step of a huge cleanup to i2c-core's i2c_detect
function, i2c.h's I2C_CLIENT_INSMOD* macros, and all drivers that made
use of them.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
---
 Documentation/i2c/old-module-parameters | 44 ++++++++++++++++
 drivers/i2c/i2c-core.c                  | 65 +----------------------
 include/linux/i2c.h                     | 91 +--------------------------------
 3 files changed, 46 insertions(+), 154 deletions(-)
 create mode 100644 Documentation/i2c/old-module-parameters

(limited to 'include/linux')

diff --git a/Documentation/i2c/old-module-parameters b/Documentation/i2c/old-module-parameters
new file mode 100644
index 000000000000..8e2b629d533c
--- /dev/null
+++ b/Documentation/i2c/old-module-parameters
@@ -0,0 +1,44 @@
+I2C device driver binding control from user-space
+=================================================
+
+Up to kernel 2.6.32, many i2c drivers used helper macros provided by
+<linux/i2c.h> which created standard module parameters to let the user
+control how the driver would probe i2c buses and attach to devices. These
+parameters were known as "probe" (to let the driver probe for an extra
+address), "force" (to forcibly attach the driver to a given device) and
+"ignore" (to prevent a driver from probing a given address).
+
+With the conversion of the i2c subsystem to the standard device driver
+binding model, it became clear that these per-module parameters were no
+longer needed, and that a centralized implementation was possible. The new,
+sysfs-based interface is described in the documentation file
+"instantiating-devices", section "Method 4: Instantiate from user-space".
+
+Below is a mapping from the old module parameters to the new interface.
+
+Attaching a driver to an I2C device
+-----------------------------------
+
+Old method (module parameters):
+# modprobe <driver> probe=1,0x2d
+# modprobe <driver> force=1,0x2d
+# modprobe <driver> force_<device>=1,0x2d
+
+New method (sysfs interface):
+# echo <device> 0x2d > /sys/bus/i2c/devices/i2c-1/new_device
+
+Preventing a driver from attaching to an I2C device
+---------------------------------------------------
+
+Old method (module parameters):
+# modprobe <driver> ignore=1,0x2f
+
+New method (sysfs interface):
+# echo dummy 0x2f > /sys/bus/i2c/devices/i2c-1/new_device
+# modprobe <driver>
+
+Of course, it is important to instantiate the "dummy" device before loading
+the driver. The dummy device will be handled by i2c-core itself, preventing
+other drivers from binding to it later on. If there is a real device at the
+problematic address, and you want another driver to bind to it, then simply
+pass the name of the device in question instead of "dummy".
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index d664b4a97a31..fdfaebdf3bfe 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -1259,40 +1259,13 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver)
 		return -ENOMEM;
 	temp_client->adapter = adapter;
 
-	/* Force entries are done first, and are not affected by ignore
-	   entries */
-	if (address_data->forces) {
-		const unsigned short * const *forces = address_data->forces;
-		int kind;
-
-		for (kind = 0; forces[kind]; kind++) {
-			for (i = 0; forces[kind][i] != I2C_CLIENT_END;
-			     i += 2) {
-				if (forces[kind][i] == adap_id
-				 || forces[kind][i] == ANY_I2C_BUS) {
-					dev_dbg(&adapter->dev, "found force "
-						"parameter for adapter %d, "
-						"addr 0x%02x, kind %d\n",
-						adap_id, forces[kind][i + 1],
-						kind);
-					temp_client->addr = forces[kind][i + 1];
-					err = i2c_detect_address(temp_client,
-						kind, driver);
-					if (err)
-						goto exit_free;
-				}
-			}
-		}
-	}
-
 	/* Stop here if the classes do not match */
 	if (!(adapter->class & driver->class))
 		goto exit_free;
 
 	/* Stop here if we can't use SMBUS_QUICK */
 	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_QUICK)) {
-		if (address_data->probe[0] == I2C_CLIENT_END
-		 && address_data->normal_i2c[0] == I2C_CLIENT_END)
+		if (address_data->normal_i2c[0] == I2C_CLIENT_END)
 			goto exit_free;
 
 		dev_warn(&adapter->dev, "SMBus Quick command not supported, "
@@ -1301,43 +1274,7 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver)
 		goto exit_free;
 	}
 
-	/* Probe entries are done second, and are not affected by ignore
-	   entries either */
-	for (i = 0; address_data->probe[i] != I2C_CLIENT_END; i += 2) {
-		if (address_data->probe[i] == adap_id
-		 || address_data->probe[i] == ANY_I2C_BUS) {
-			dev_dbg(&adapter->dev, "found probe parameter for "
-				"adapter %d, addr 0x%02x\n", adap_id,
-				address_data->probe[i + 1]);
-			temp_client->addr = address_data->probe[i + 1];
-			err = i2c_detect_address(temp_client, -1, driver);
-			if (err)
-				goto exit_free;
-		}
-	}
-
-	/* Normal entries are done last, unless shadowed by an ignore entry */
 	for (i = 0; address_data->normal_i2c[i] != I2C_CLIENT_END; i += 1) {
-		int j, ignore;
-
-		ignore = 0;
-		for (j = 0; address_data->ignore[j] != I2C_CLIENT_END;
-		     j += 2) {
-			if ((address_data->ignore[j] == adap_id ||
-			     address_data->ignore[j] == ANY_I2C_BUS)
-			 && address_data->ignore[j + 1]
-			    == address_data->normal_i2c[i]) {
-				dev_dbg(&adapter->dev, "found ignore "
-					"parameter for adapter %d, "
-					"addr 0x%02x\n", adap_id,
-					address_data->ignore[j + 1]);
-				ignore = 1;
-				break;
-			}
-		}
-		if (ignore)
-			continue;
-
 		dev_dbg(&adapter->dev, "found normal entry for adapter %d, "
 			"addr 0x%02x\n", adap_id,
 			address_data->normal_i2c[i]);
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 52317fb5917e..419ab546b266 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -110,7 +110,7 @@ extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client,
  * @driver: Device driver model driver
  * @id_table: List of I2C devices supported by this driver
  * @detect: Callback for device detection
- * @address_data: The I2C addresses to probe, ignore or force (for detect)
+ * @address_data: The I2C addresses to probe (for detect)
  * @clients: List of detected clients we created (for i2c-core use only)
  *
  * The driver.owner field should be set to the module owner of this driver.
@@ -397,9 +397,6 @@ static inline void i2c_unlock_adapter(struct i2c_adapter *adapter)
  */
 struct i2c_client_address_data {
 	const unsigned short *normal_i2c;
-	const unsigned short *probe;
-	const unsigned short *ignore;
-	const unsigned short * const *forces;
 };
 
 /* Internal numbers to terminate lists */
@@ -613,134 +610,48 @@ union i2c_smbus_data {
   module_param_array(var, short, &var##_num, 0); \
   MODULE_PARM_DESC(var, desc)
 
-#define I2C_CLIENT_MODULE_PARM_FORCE(name)				\
-I2C_CLIENT_MODULE_PARM(force_##name,					\
-		       "List of adapter,address pairs which are "	\
-		       "unquestionably assumed to contain a `"		\
-		       # name "' chip")
-
-
 #define I2C_CLIENT_INSMOD_COMMON					\
-I2C_CLIENT_MODULE_PARM(probe, "List of adapter,address pairs to scan "	\
-		       "additionally");					\
-I2C_CLIENT_MODULE_PARM(ignore, "List of adapter,address pairs not to "	\
-		       "scan");						\
 static const struct i2c_client_address_data addr_data = {		\
 	.normal_i2c	= normal_i2c,					\
-	.probe		= probe,					\
-	.ignore		= ignore,					\
-	.forces		= forces,					\
 }
 
-#define I2C_CLIENT_FORCE_TEXT \
-	"List of adapter,address pairs to boldly assume to be present"
-
 /* These are the ones you want to use in your own drivers. Pick the one
    which matches the number of devices the driver differenciates between. */
 #define I2C_CLIENT_INSMOD						\
-I2C_CLIENT_MODULE_PARM(force, I2C_CLIENT_FORCE_TEXT);			\
-static const unsigned short * const forces[] = { force, NULL };		\
 I2C_CLIENT_INSMOD_COMMON
 
 #define I2C_CLIENT_INSMOD_1(chip1)					\
 enum chips { any_chip, chip1 };						\
-I2C_CLIENT_MODULE_PARM(force, I2C_CLIENT_FORCE_TEXT);			\
-I2C_CLIENT_MODULE_PARM_FORCE(chip1);					\
-static const unsigned short * const forces[] =	{ force,		\
-	force_##chip1, NULL };						\
 I2C_CLIENT_INSMOD_COMMON
 
 #define I2C_CLIENT_INSMOD_2(chip1, chip2)				\
 enum chips { any_chip, chip1, chip2 };					\
-I2C_CLIENT_MODULE_PARM(force, I2C_CLIENT_FORCE_TEXT);			\
-I2C_CLIENT_MODULE_PARM_FORCE(chip1);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip2);					\
-static const unsigned short * const forces[] =	{ force,		\
-	force_##chip1, force_##chip2, NULL };				\
 I2C_CLIENT_INSMOD_COMMON
 
 #define I2C_CLIENT_INSMOD_3(chip1, chip2, chip3)			\
 enum chips { any_chip, chip1, chip2, chip3 };				\
-I2C_CLIENT_MODULE_PARM(force, I2C_CLIENT_FORCE_TEXT);			\
-I2C_CLIENT_MODULE_PARM_FORCE(chip1);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip2);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip3);					\
-static const unsigned short * const forces[] =	{ force,		\
-	force_##chip1, force_##chip2, force_##chip3, NULL };		\
 I2C_CLIENT_INSMOD_COMMON
 
 #define I2C_CLIENT_INSMOD_4(chip1, chip2, chip3, chip4)			\
 enum chips { any_chip, chip1, chip2, chip3, chip4 };			\
-I2C_CLIENT_MODULE_PARM(force, I2C_CLIENT_FORCE_TEXT);			\
-I2C_CLIENT_MODULE_PARM_FORCE(chip1);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip2);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip3);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip4);					\
-static const unsigned short * const forces[] =	{ force,		\
-	force_##chip1, force_##chip2, force_##chip3,			\
-	force_##chip4, NULL};						\
 I2C_CLIENT_INSMOD_COMMON
 
 #define I2C_CLIENT_INSMOD_5(chip1, chip2, chip3, chip4, chip5)		\
 enum chips { any_chip, chip1, chip2, chip3, chip4, chip5 };		\
-I2C_CLIENT_MODULE_PARM(force, I2C_CLIENT_FORCE_TEXT);			\
-I2C_CLIENT_MODULE_PARM_FORCE(chip1);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip2);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip3);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip4);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip5);					\
-static const unsigned short * const forces[] = { force,			\
-	force_##chip1, force_##chip2, force_##chip3,			\
-	force_##chip4, force_##chip5, NULL };				\
 I2C_CLIENT_INSMOD_COMMON
 
 #define I2C_CLIENT_INSMOD_6(chip1, chip2, chip3, chip4, chip5, chip6)	\
 enum chips { any_chip, chip1, chip2, chip3, chip4, chip5, chip6 };	\
-I2C_CLIENT_MODULE_PARM(force, I2C_CLIENT_FORCE_TEXT);			\
-I2C_CLIENT_MODULE_PARM_FORCE(chip1);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip2);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip3);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip4);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip5);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip6);					\
-static const unsigned short * const forces[] = { force,			\
-	force_##chip1, force_##chip2, force_##chip3,			\
-	force_##chip4, force_##chip5, force_##chip6, NULL };		\
 I2C_CLIENT_INSMOD_COMMON
 
 #define I2C_CLIENT_INSMOD_7(chip1, chip2, chip3, chip4, chip5, chip6, chip7) \
 enum chips { any_chip, chip1, chip2, chip3, chip4, chip5, chip6,	\
 	     chip7 };							\
-I2C_CLIENT_MODULE_PARM(force, I2C_CLIENT_FORCE_TEXT);			\
-I2C_CLIENT_MODULE_PARM_FORCE(chip1);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip2);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip3);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip4);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip5);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip6);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip7);					\
-static const unsigned short * const forces[] = { force,			\
-	force_##chip1, force_##chip2, force_##chip3,			\
-	force_##chip4, force_##chip5, force_##chip6,			\
-	force_##chip7, NULL };						\
 I2C_CLIENT_INSMOD_COMMON
 
 #define I2C_CLIENT_INSMOD_8(chip1, chip2, chip3, chip4, chip5, chip6, chip7, chip8) \
 enum chips { any_chip, chip1, chip2, chip3, chip4, chip5, chip6,	\
 	     chip7, chip8 };						\
-I2C_CLIENT_MODULE_PARM(force, I2C_CLIENT_FORCE_TEXT);			\
-I2C_CLIENT_MODULE_PARM_FORCE(chip1);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip2);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip3);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip4);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip5);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip6);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip7);					\
-I2C_CLIENT_MODULE_PARM_FORCE(chip8);					\
-static const unsigned short * const forces[] = { force,			\
-	force_##chip1, force_##chip2, force_##chip3,			\
-	force_##chip4, force_##chip5, force_##chip6,			\
-	force_##chip7, force_##chip8, NULL };				\
 I2C_CLIENT_INSMOD_COMMON
 #endif /* __KERNEL__ */
 #endif /* _LINUX_I2C_H */
-- 
cgit v1.2.3