author		Ingo Molnar <mingo@elte.hu>		2009-07-03 08:30:07 -0500
committer	Thomas Gleixner <tglx@linutronix.de>	2009-07-29 23:30:38 +0200
commit		e9888fb95225bb3b786d79fd983eb67e1acad338 (patch)
tree		d09bc138fec534e17b57a559f36a5d8c7e49973f
parent		94b3cbf2548a023b4187e252043eac367f84740c (diff)
rt: core implementation
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
 include/linux/hardirq.h    |  6
 include/linux/kernel.h     |  9
 include/linux/profile.h    | 11
 include/linux/radix-tree.h | 13
 include/linux/smp.h        | 11
 include/linux/smp_lock.h   |  2
 include/linux/workqueue.h  |  3
 kernel/Kconfig.preempt     | 87
 kernel/exit.c              | 22
 kernel/fork.c              | 12
 kernel/notifier.c          |  4
 kernel/signal.c            | 12
 kernel/softirq.c           | 14
 kernel/workqueue.c         | 54
 lib/Kconfig.debug          |  4
 lib/Makefile               |  3
 lib/kernel_lock.c          | 14
 lib/locking-selftest.c     | 29
 lib/radix-tree.c           |  6
 19 files changed, 244 insertions(+), 72 deletions(-)
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 70b12547cfb2..16966fbab185 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -77,9 +77,9 @@
  * Are we doing bottom half or hardware interrupt processing?
  * Are we in a softirq context? Interrupt context?
  */
-#define in_irq()		(hardirq_count())
-#define in_softirq()		(softirq_count())
-#define in_interrupt()		(irq_count())
+#define in_irq()	(hardirq_count() || (current->flags & PF_HARDIRQ))
+#define in_softirq()	(softirq_count() || (current->flags & PF_SOFTIRQ))
+#define in_interrupt()	(irq_count())
 
 /*
  * Are we in NMI context?
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6320a3e8def..4651e0971d75 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -124,7 +124,7 @@ extern int _cond_resched(void);
 # define might_resched() do { } while (0)
 #endif
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
 void __might_sleep(char *file, int line);
 /**
  * might_sleep - annotation for functions that can sleep
@@ -284,6 +284,12 @@ extern void printk_tick(void);
 extern void asmlinkage __attribute__((format(printf, 1, 2)))
 	early_printk(const char *fmt, ...);
 
+#ifdef CONFIG_PREEMPT_RT
+extern void zap_rt_locks(void);
+#else
+# define zap_rt_locks()	do { } while (0)
+#endif
+
 unsigned long int_sqrt(unsigned long);
 
 static inline void console_silent(void)
@@ -313,6 +319,7 @@ extern int root_mountflags;
 /* Values used for system_state */
 extern enum system_states {
 	SYSTEM_BOOTING,
+	SYSTEM_BOOTING_SCHEDULER_OK,
 	SYSTEM_RUNNING,
 	SYSTEM_HALT,
 	SYSTEM_POWER_OFF,
diff --git a/include/linux/profile.h b/include/linux/profile.h
index a0fc32279fc0..5b72082c273e 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -8,10 +8,11 @@
 
 #include <asm/errno.h>
 
-#define CPU_PROFILING	1
-#define SCHED_PROFILING	2
-#define SLEEP_PROFILING	3
-#define KVM_PROFILING	4
+#define CPU_PROFILING		1
+#define SCHED_PROFILING		2
+#define SLEEP_PROFILING		3
+#define KVM_PROFILING		4
+#define PREEMPT_PROFILING	5
 
 struct proc_dir_entry;
 struct pt_regs;
@@ -36,6 +37,8 @@ enum profile_type {
 	PROFILE_MUNMAP
 };
 
+extern int prof_pid;
+
 #ifdef CONFIG_PROFILING
 
 extern int prof_on __read_mostly;
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c5da74918096..9eb17f95857b 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -169,7 +169,18 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 				unsigned long index, unsigned long max_scan);
 unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
 				unsigned long index, unsigned long max_scan);
+/*
+ * On a mutex based kernel we can freely schedule within the radix code:
+ */
+#ifdef CONFIG_PREEMPT_RT
+static inline int radix_tree_preload(gfp_t gfp_mask)
+{
+	return 0;
+}
+#else
 int radix_tree_preload(gfp_t gfp_mask);
+#endif
+
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
@@ -189,7 +200,9 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
 
 static inline void radix_tree_preload_end(void)
 {
+#ifndef CONFIG_PREEMPT_RT
 	preempt_enable();
+#endif
 }
 
 #endif /* _LINUX_RADIX_TREE_H */
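The radix-tree.h hunks above preserve the usual preload idiom for callers on both kernels: radix_tree_preload() pins a per-CPU node reserve with preemption disabled on mainline, while on PREEMPT_RT it is a no-op because the insertion path may simply sleep in the allocator. A minimal sketch of the caller pattern (the function and lock names below are hypothetical, not part of this patch):

/* Hypothetical caller illustrating the preload idiom this hunk keeps
 * working under both !PREEMPT_RT and PREEMPT_RT: */
static DEFINE_SPINLOCK(example_tree_lock);

static int example_insert(struct radix_tree_root *root,
			  unsigned long index, void *item)
{
	int ret;

	ret = radix_tree_preload(GFP_KERNEL);	/* no-op, returns 0 on -rt */
	if (ret)
		return ret;

	spin_lock(&example_tree_lock);		/* a sleeping lock on -rt */
	ret = radix_tree_insert(root, index, item);
	spin_unlock(&example_tree_lock);

	radix_tree_preload_end();	/* preempt_enable() only on !RT */
	return ret;
}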
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9e3d8af09207..378005121d60 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -50,6 +50,16 @@ extern void smp_send_stop(void);
  */
 extern void smp_send_reschedule(int cpu);
 
+/*
+ * trigger a reschedule on all other CPUs:
+ */
+extern void smp_send_reschedule_allbutself(void);
+
+/*
+ * trigger a reschedule on all other CPUs:
+ */
+extern void smp_send_reschedule_allbutself(void);
+
 /*
  * Prepare machine for booting other CPUs.
  */
@@ -142,6 +152,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info)
 	0;					\
 })
 static inline void smp_send_reschedule(int cpu) { }
+static inline void smp_send_reschedule_allbutself(void) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
 #define smp_call_function_mask(mask, func, info, wait) \
diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h
index 813be59bf345..0cb3cf9a68ef 100644
--- a/include/linux/smp_lock.h
+++ b/include/linux/smp_lock.h
@@ -45,7 +45,7 @@ static inline void cycle_kernel_lock(void)
 #define unlock_kernel()				do { } while(0)
 #define release_kernel_lock(task)		do { } while(0)
 #define cycle_kernel_lock()			do { } while(0)
-#define reacquire_kernel_lock(task)		0
+#define reacquire_kernel_lock(task)		do { } while(0)
 #define kernel_locked()				1
 
 #endif /* CONFIG_LOCK_KERNEL */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 13e1adf55c4c..3f363b7168c4 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -195,6 +195,9 @@ __create_workqueue_key(const char *name, int singlethread,
 #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0)
 #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0)
 
+extern void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+			       int rt_priority, int nice);
+
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
 extern int queue_work(struct workqueue_struct *wq, struct work_struct *work);
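The new smp_send_reschedule_allbutself() follows the standard smp.h convention: the SMP build gets a real declaration, the UP build a static inline no-op, so callers need no #ifdefs. A generic sketch of that stub pattern (names here are invented, for illustration only):

/* Generic sketch of the SMP/UP stub pattern used above: */
#ifdef CONFIG_SMP
extern void example_notify_other_cpus(void);
#else
static inline void example_notify_other_cpus(void) { }
#endif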
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 4bb60418779f..f4602f8f35d4 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,14 +1,13 @@
-
 choice
-	prompt "Preemption Model"
-	default PREEMPT_NONE
+	prompt "Preemption Mode"
+	default PREEMPT_RT
 
 config PREEMPT_NONE
 	bool "No Forced Preemption (Server)"
 	help
-	  This is the traditional Linux preemption model, geared towards
+	  This is the traditional Linux preemption model geared towards
 	  throughput. It will still provide good latencies most of the
-	  time, but there are no guarantees and occasional longer delays
+	  time but there are no guarantees and occasional long delays
 	  are possible.
 
 	  Select this option if you are building a kernel for a server or
@@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY
 	help
 	  This option reduces the latency of the kernel by adding more
 	  "explicit preemption points" to the kernel code. These new
-	  preemption points have been selected to reduce the maximum
+	  preemption points have been selected to minimize the maximum
 	  latency of rescheduling, providing faster application reactions,
 	  at the cost of slightly lower throughput.
 
@@ -33,38 +32,73 @@ config PREEMPT_VOLUNTARY
 
 	  Select this if you are building a kernel for a desktop system.
 
-config PREEMPT
+config PREEMPT_DESKTOP
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	help
 	  This option reduces the latency of the kernel by making
-	  all kernel code (that is not executing in a critical section)
+	  all kernel code that is not executing in a critical section
 	  preemptible.  This allows reaction to interactive events by
 	  permitting a low priority process to be preempted involuntarily
 	  even if it is in kernel mode executing a system call and would
-	  otherwise not be about to reach a natural preemption point.
-	  This allows applications to run more 'smoothly' even when the
-	  system is under load, at the cost of slightly lower throughput
-	  and a slight runtime overhead to kernel code.
+	  otherwise not be about to reach a preemption point.  This allows
+	  applications to run more 'smoothly' even when the system is
+	  under load, at the cost of slightly lower throughput and a
+	  slight runtime overhead to kernel code.
+
+	  (According to profiles, when this mode is selected then even
+	  during kernel-intense workloads the system is in an immediately
+	  preemptible state more than 50% of the time.)
 
 	  Select this if you are building a kernel for a desktop or
 	  embedded system with latency requirements in the milliseconds
 	  range.
 
+config PREEMPT_RT
+	bool "Complete Preemption (Real-Time)"
+	select PREEMPT_SOFTIRQS
+	select PREEMPT_HARDIRQS
+	select PREEMPT_RCU
+	select RT_MUTEXES
+	help
+	  This option further reduces the scheduling latency of the
+	  kernel by replacing almost every spinlock used by the kernel
+	  with preemptible mutexes and thus making all but the most
+	  critical kernel code involuntarily preemptible. The remaining
+	  handful of lowlevel non-preemptible codepaths are short and
+	  have a deterministic latency of a couple of tens of
+	  microseconds (depending on the hardware).  This also allows
+	  applications to run more 'smoothly' even when the system is
+	  under load, at the cost of lower throughput and runtime
+	  overhead to kernel code.
+
+	  (According to profiles, when this mode is selected then even
+	  during kernel-intense workloads the system is in an immediately
+	  preemptible state more than 95% of the time.)
+
+	  Select this if you are building a kernel for a desktop,
+	  embedded or real-time system with guaranteed latency
+	  requirements of 100 usecs or lower.
+
 endchoice
 
+config PREEMPT
+	bool
+	default y
+	depends on PREEMPT_DESKTOP || PREEMPT_RT
+
 config PREEMPT_SOFTIRQS
 	bool "Thread Softirqs"
 	default n
 #	depends on PREEMPT
 	help
 	  This option reduces the latency of the kernel by 'threading'
-	  soft interrupts. This means that all softirqs will execute
-	  in softirqd's context. While this helps latency, it can also
-	  reduce performance.
+	  soft interrupts. This means that all softirqs will execute
+	  in softirqd's context. While this helps latency, it can also
+	  reduce performance.
 
-	  The threading of softirqs can also be controlled via
-	  /proc/sys/kernel/softirq_preemption runtime flag and the
-	  sofirq-preempt=0/1 boot-time option.
+	  The threading of softirqs can also be controlled via the
+	  /proc/sys/kernel/softirq_preemption runtime flag and the
+	  softirq-preempt=0/1 boot-time option.
 
 	  Say N if you are unsure.
 
@@ -75,15 +109,14 @@ config PREEMPT_HARDIRQS
 	select PREEMPT_SOFTIRQS
 	help
 	  This option reduces the latency of the kernel by 'threading'
-	  hardirqs. This means that all (or selected) hardirqs will run
-	  in their own kernel thread context. While this helps latency,
-	  this feature can also reduce performance.
+	  hardirqs. This means that all (or selected) hardirqs will run
+	  in their own kernel thread context. While this helps latency,
+	  this feature can also reduce performance.
 
-	  The threading of hardirqs can also be controlled via the
-	  /proc/sys/kernel/hardirq_preemption runtime flag and the
-	  hardirq-preempt=0/1 boot-time option. Per-irq threading can
-	  be enabled/disable via the /proc/irq/<IRQ>/<handler>/threaded
-	  runtime flags.
+	  The threading of hardirqs can also be controlled via the
+	  /proc/sys/kernel/hardirq_preemption runtime flag and the
+	  hardirq-preempt=0/1 boot-time option.  Per-irq threading can
+	  be enabled/disabled via the /proc/irq/<IRQ>/<handler>/threaded
+	  runtime flags.
 
 	  Say N if you are unsure.
-
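Note how the choice is restructured: PREEMPT becomes a derived, promptless bool selected by either PREEMPT_DESKTOP or PREEMPT_RT, so existing code conditional on CONFIG_PREEMPT stays active under both models. A hedged illustration (the helper below is made up, not from this patch):

/* Active for PREEMPT_DESKTOP and PREEMPT_RT alike, since both now
 * imply CONFIG_PREEMPT; RT-only code would test CONFIG_PREEMPT_RT. */
#ifdef CONFIG_PREEMPT
static inline void example_cond_preempt(void)
{
	if (need_resched())
		preempt_schedule();
}
#else
static inline void example_cond_preempt(void) { }
#endif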
diff --git a/kernel/exit.c b/kernel/exit.c
index a27d47d3d0c8..4441e623e671 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -67,7 +67,9 @@ static void __unhash_process(struct task_struct *p)
 		detach_pid(p, PIDTYPE_SID);
 
 		list_del_rcu(&p->tasks);
+		preempt_disable();
 		__get_cpu_var(process_counts)--;
+		preempt_enable();
 	}
 	list_del_rcu(&p->thread_group);
 	list_del_init(&p->sibling);
@@ -685,9 +687,11 @@ static void exit_mm(struct task_struct * tsk)
 	task_lock(tsk);
 	tsk->mm = NULL;
 	up_read(&mm->mmap_sem);
+	preempt_disable();	// FIXME
 	enter_lazy_tlb(mm, current);
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
+	preempt_enable();
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
@@ -1009,14 +1013,17 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	preempt_disable();
+again:
+	local_irq_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
-	schedule();
-	BUG();
-	/* Avoid "noreturn function does return".  */
-	for (;;)
-		cpu_relax();	/* For when BUG is null */
+	__schedule();
+	printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n",
+		current->comm, current->pid);
+	printk(KERN_ERR ".... flags: %08x, count: %d, state: %08lx\n",
+		current->flags, atomic_read(&current->usage), current->state);
+	printk(KERN_ERR ".... trying again ...\n");
+	goto again;
 }
 
 EXPORT_SYMBOL_GPL(do_exit);
@@ -1476,6 +1483,9 @@ static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
 				int ptrace, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
+
+	BUG_ON(!atomic_read(&p->usage));
+
 	if (!ret)
 		return ret;
diff --git a/kernel/fork.c b/kernel/fork.c
index 7a35caaef037..b013c7ed4b5a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -175,6 +175,16 @@ void __put_task_struct(struct task_struct *tsk)
 		free_task(tsk);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+	__put_task_struct(tsk);
+
+}
+#endif
+
 /*
  * macro override instead of weak attribute alias, to workaround
  * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
@@ -1235,11 +1245,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * to ensure it is on a valid CPU (and if not, just force it back to
 	 * parent's CPU). This avoids a lot of nasty races.
 	 */
+	preempt_disable();
 	p->cpus_allowed = current->cpus_allowed;
 	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
+	preempt_enable();
 
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..cf40c2d9817f 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -71,7 +71,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
  *	@returns:	notifier_call_chain returns the value returned by the
  *			last notifier function called.
  */
-static int __kprobes notifier_call_chain(struct notifier_block **nl,
+static int __kprobes notrace notifier_call_chain(struct notifier_block **nl,
 					unsigned long val, void *v,
 					int nr_to_call, int *nr_calls)
 {
@@ -217,7 +217,7 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
 	 * not yet working and interrupts must remain disabled.  At
 	 * such times we must not call down_write().
 	 */
-	if (unlikely(system_state == SYSTEM_BOOTING))
+	if (unlikely(system_state < SYSTEM_RUNNING))
 		return notifier_chain_register(&nh->head, n);
 
 	down_write(&nh->rwsem);
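The new __put_task_struct_cb() is the callback half of the standard call_rcu() deferral pattern; a sketch of how the release side would arm it (the wrapper below is illustrative only — the matching put_task_struct() rework lives elsewhere in the -rt series):

/* Hypothetical release path: on -rt the final free is deferred past
 * an RCU grace period instead of happening in the caller's context. */
static inline void example_put_task_struct(struct task_struct *t)
{
	if (atomic_dec_and_test(&t->usage))
		call_rcu(&t->rcu, __put_task_struct_cb);
}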
diff --git a/kernel/signal.c b/kernel/signal.c
index b1889c55fd53..3dd14285faef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -916,8 +916,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 
 	trace_sched_signal_send(sig, t);
 
+#ifdef CONFIG_SMP
 	assert_spin_locked(&t->sighand->siglock);
-
+#endif
 	if (!prepare_signal(sig, t, from_ancestor_ns))
 		return 0;
 
@@ -1692,15 +1693,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 	read_lock(&tasklist_lock);
 	if (may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
-		/*
-		 * Don't want to allow preemption here, because
-		 * sys_ptrace() needs this task to be inactive.
-		 *
-		 * XXX: implement read_unlock_no_resched().
-		 */
-		preempt_disable();
 		read_unlock(&tasklist_lock);
-		preempt_enable_and_schedule();
+		schedule();
 	} else {
 		/*
 		 * By the time we got the lock, our tracer went away.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e53eb38bab84..87821717bb3c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -20,6 +20,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/delay.h>
 #include <linux/mm.h>
 #include <linux/notifier.h>
 #include <linux/percpu.h>
@@ -106,6 +107,8 @@ static void trigger_softirqs(void)
 	}
 }
 
+#ifndef CONFIG_PREEMPT_RT
+
 /*
  * This one is for softirq.c-internal use,
  * where hardirqs are disabled legitimately:
@@ -207,6 +210,8 @@ void local_bh_enable_ip(unsigned long ip)
 }
 EXPORT_SYMBOL(local_bh_enable_ip);
 
+#endif
+
 /*
  * We restart softirq processing MAX_SOFTIRQ_RESTART times,
  * and we fall back to softirqd after that.
@@ -606,7 +611,7 @@ void tasklet_kill(struct tasklet_struct *t)
 
 	while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
 		do {
-			yield();
+			msleep(1);
 		} while (test_bit(TASKLET_STATE_SCHED, &t->state));
 	}
 	tasklet_unlock_wait(t);
@@ -1064,6 +1069,11 @@ int softirq_preemption = 1;
 
 EXPORT_SYMBOL(softirq_preemption);
 
+/*
+ * Real-Time Preemption depends on softirq threading:
+ */
+#ifndef CONFIG_PREEMPT_RT
+
 static int __init softirq_preempt_setup (char *str)
 {
 	if (!strncmp(str, "off", 3))
@@ -1077,7 +1087,7 @@ static int __init softirq_preempt_setup (char *str)
 }
 
 __setup("softirq-preempt=", softirq_preempt_setup);
-
+#endif
 #endif
 
 #ifdef CONFIG_SMP
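The tasklet_kill() change swaps a yield() busy-loop for msleep(1): with threaded softirqs, a SCHED_FIFO caller could otherwise spin without ever letting the softirq thread clear TASKLET_STATE_SCHED. The generic sleep-and-poll idiom, with hypothetical names:

#include <linux/delay.h>

/* Sleep rather than yield while polling: progress is guaranteed even
 * when the waiter outranks the thread that must clear the bit. */
static void example_wait_bit_clear(unsigned long *word, int bit)
{
	while (test_bit(bit, word))
		msleep(1);
}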
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..d3f9b451f289 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/syscalls.h>
 #include <linux/kthread.h>
 #include <linux/hardirq.h>
 #include <linux/mempolicy.h>
@@ -36,6 +37,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
 
+#include <asm/uaccess.h>
+
 /*
  * The per-CPU workqueue (if single thread, we always use the first
  * possible cpu).
@@ -159,13 +162,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
  *
  * We queue the work to the CPU on which it was submitted, but if the CPU dies
  * it can be processed by another CPU.
+ *
+ * Especially no such guarantee on PREEMPT_RT.
  */
 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
-	int ret;
+	int ret = 0, cpu = raw_smp_processor_id();
 
-	ret = queue_work_on(get_cpu(), wq, work);
-	put_cpu();
+	ret = queue_work_on(cpu, wq, work);
 
 	return ret;
 }
@@ -883,6 +887,49 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 	cwq->thread = NULL;
 }
 
+void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu,
+			       int policy, int rt_priority, int nice)
+{
+	struct sched_param param = { .sched_priority = rt_priority };
+	struct cpu_workqueue_struct *cwq;
+	mm_segment_t oldfs = get_fs();
+	struct task_struct *p;
+	unsigned long flags;
+	int ret;
+
+	cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+	spin_lock_irqsave(&cwq->lock, flags);
+	p = cwq->thread;
+	spin_unlock_irqrestore(&cwq->lock, flags);
+
+	set_user_nice(p, nice);
+
+	set_fs(KERNEL_DS);
+	ret = sys_sched_setscheduler(p->pid, policy, &param);
+	set_fs(oldfs);
+
+	WARN_ON(ret);
+}
+
+void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+			int rt_priority, int nice)
+{
+	int cpu;
+
+	/* We don't need the distraction of CPUs appearing and vanishing. */
+	get_online_cpus();
+	spin_lock(&workqueue_lock);
+	if (is_wq_single_threaded(wq))
+		set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice);
+	else {
+		for_each_online_cpu(cpu)
+			set_workqueue_thread_prio(wq, cpu, policy,
+						  rt_priority, nice);
+	}
+	spin_unlock(&workqueue_lock);
+	put_online_cpus();
+}
+
 /**
  * destroy_workqueue - safely terminate a workqueue
  * @wq: target workqueue
@@ -1015,4 +1062,5 @@ void __init init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
+	set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20);
 }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 12327b2bb785..8a2ddd3f0922 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -397,6 +397,8 @@ config DEBUG_RT_MUTEXES
 	help
 	 This allows rt mutex semantics violations and rt mutex related
 	 deadlocks (lockups) to be detected and reported automatically.
+	 When realtime preemption is enabled this includes spinlocks,
+	 rwlocks, mutexes and (rw)semaphores.
 
 config DEBUG_PI_LIST
 	bool
@@ -420,7 +422,7 @@ config DEBUG_SPINLOCK
 
 config DEBUG_MUTEXES
 	bool "Mutex debugging: basic checks"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && !PREEMPT_RT
 	help
 	 This feature allows mutex semantics violations to be detected and
 	 reported.
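For the set_workqueue_prio() API added in kernel/workqueue.c above, a hedged usage sketch (the workqueue name and priority are invented): a subsystem with latency-critical work items can raise its worker threads to a real-time policy the same way init_workqueues() now does for keventd:

/* Hypothetical subsystem init raising its workqueue to SCHED_FIFO. */
static struct workqueue_struct *example_wq;

static int __init example_subsys_init(void)
{
	example_wq = create_workqueue("example");
	if (!example_wq)
		return -ENOMEM;
	/* (policy, rt_priority, nice) - priority 42 is arbitrary here */
	set_workqueue_prio(example_wq, SCHED_FIFO, 42, -20);
	return 0;
}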
diff --git a/lib/Makefile b/lib/Makefile
index b6d1857bbf08..0d2ee155f4c9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,7 +34,8 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
-lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+obj-$(CONFIG_PREEMPT_RT) += plist.o
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 54625bec6fb9..709c432b8a92 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -35,6 +35,8 @@ DEFINE_SEMAPHORE(kernel_sem);
  * about recursion, both due to the down() and due to the enabling of
  * preemption. schedule() will re-check the preemption flag after
  * reacquiring the semaphore.
+ *
+ * Called with interrupts disabled.
  */
 int __lockfunc __reacquire_kernel_lock(void)
 {
@@ -67,11 +69,15 @@ void __lockfunc lock_kernel(void)
 	struct task_struct *task = current;
 	int depth = task->lock_depth + 1;
 
-	if (likely(!depth))
+	if (likely(!depth)) {
 		/*
 		 * No recursion worries - we set up lock_depth _after_
 		 */
 		down(&kernel_sem);
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+		current->last_kernel_lock = __builtin_return_address(0);
+#endif
+	}
 
 	task->lock_depth = depth;
 }
@@ -82,8 +88,12 @@ void __lockfunc unlock_kernel(void)
 
 	BUG_ON(task->lock_depth < 0);
 
-	if (likely(--task->lock_depth < 0))
+	if (likely(--task->lock_depth < 0)) {
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+		current->last_kernel_lock = NULL;
+#endif
 		up(&kernel_sem);
+	}
 }
 
 EXPORT_SYMBOL(lock_kernel);
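For context on the lock_kernel()/unlock_kernel() hunks: lock_depth implements a per-task recursion count (-1 means not held), so only the outermost acquisition and the final release touch the semaphore. A stripped-down sketch of that pattern, with invented names:

/* Generic recursive-lock sketch; not the actual BKL code. */
struct example_owner {
	int lock_depth;		/* -1 == not held */
};

static void example_lock(struct example_owner *o, struct semaphore *sem)
{
	if (o->lock_depth < 0)		/* outermost acquisition only */
		down(sem);
	o->lock_depth++;
}

static void example_unlock(struct example_owner *o, struct semaphore *sem)
{
	if (--o->lock_depth < 0)	/* final release only */
		up(sem);
}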
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 619313ed6c46..65e7eab8498e 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -158,7 +158,7 @@ static void init_shared_classes(void)
 	local_bh_disable();		\
 	local_irq_disable();		\
 	lockdep_softirq_enter();	\
-	WARN_ON(!in_softirq());
+	/* FIXME: preemptible softirqs. WARN_ON(!in_softirq()); */
 
 #define SOFTIRQ_EXIT()			\
 	lockdep_softirq_exit();		\
@@ -550,6 +550,11 @@ GENERATE_TESTCASE(init_held_rsem)
 #undef E
 
 /*
+ * FIXME: turn these into raw-spinlock tests on -rt
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+/*
  * locking an irq-safe lock with irqs enabled:
  */
 #define E1()				\
@@ -890,6 +895,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
 #include "locking-selftest-softirq.h"
 // GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft)
 
+#endif /* !CONFIG_PREEMPT_RT */
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define I_SPINLOCK(x)	lockdep_reset_lock(&lock_##x.dep_map)
 # define I_RWLOCK(x)	lockdep_reset_lock(&rwlock_##x.dep_map)
@@ -998,7 +1005,7 @@ static inline void print_testname(const char *testname)
 
 #define DO_TESTCASE_1(desc, name, nr)				\
 	print_testname(desc"/"#nr);				\
-	dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK);		\
+	dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK);	\
 	printk("\n");
 
 #define DO_TESTCASE_1B(desc, name, nr)				\
 	print_testname(desc"/"#nr);				\
 	dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK);		\
 	printk("\n");
 
-#define DO_TESTCASE_3(desc, name, nr)				\
-	print_testname(desc"/"#nr);				\
-	dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN);	\
-	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);	\
+#define DO_TESTCASE_3(desc, name, nr)			\
+	print_testname(desc"/"#nr);			\
+	dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN);	\
+	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);	\
 	dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK);	\
 	printk("\n");
 
-#define DO_TESTCASE_3RW(desc, name, nr)				\
-	print_testname(desc"/"#nr);				\
+#define DO_TESTCASE_3RW(desc, name, nr)			\
+	print_testname(desc"/"#nr);			\
 	dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\
-	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);	\
+	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);	\
 	dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK);	\
 	printk("\n");
 
@@ -1047,7 +1054,7 @@ static inline void print_testname(const char *testname)
 	print_testname(desc);					\
 	dotest(name##_spin, FAILURE, LOCKTYPE_SPIN);		\
 	dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK);		\
-	dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK);		\
+	dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK);	\
 	dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX);		\
 	dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM);		\
 	dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM);		\
@@ -1179,6 +1186,7 @@ void locking_selftest(void)
 	/*
 	 * irq-context testcases:
 	 */
+#ifndef CONFIG_PREEMPT_RT
 	DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1);
 	DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A);
 	DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B);
@@ -1188,6 +1196,7 @@ void locking_selftest(void)
 
 	DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
 //	DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
+#endif
 
 	if (unexpected_testcase_failures) {
 		printk("-----------------------------------------------------------------\n");
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 23abbd93cae1..e209012e5d31 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -157,12 +157,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 		 * succeed in getting a node here (and never reach
 		 * kmem_cache_alloc)
 		 */
+		rtp = &get_cpu_var(radix_tree_preloads);
 		rtp = &__get_cpu_var(radix_tree_preloads);
 		if (rtp->nr) {
 			ret = rtp->nodes[rtp->nr - 1];
 			rtp->nodes[rtp->nr - 1] = NULL;
 			rtp->nr--;
 		}
+		put_cpu_var(radix_tree_preloads);
 	}
 	if (ret == NULL)
 		ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
@@ -195,6 +197,8 @@ radix_tree_node_free(struct radix_tree_node *node)
 	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
 }
 
+#ifndef CONFIG_PREEMPT_RT
+
 /*
  * Load up this CPU's radix_tree_node buffer with sufficient objects to
  * ensure that the addition of a single element in the tree cannot fail.  On
@@ -227,6 +231,8 @@ out:
 }
 EXPORT_SYMBOL(radix_tree_preload);
 
+#endif
+
 /*
  * Return the maximum key which can be store into a
  * radix tree with height HEIGHT.
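The radix_tree_node_alloc() hunk wraps the per-CPU access in get_cpu_var()/put_cpu_var(), the standard idiom for keeping a per-CPU pointer stable across the access window. A generic sketch of that idiom with made-up names:

/* Generic per-CPU access sketch: get_cpu_var() disables preemption
 * for the access window; put_cpu_var() re-enables it. */
struct example_pcpu {
	int nr;
};
static DEFINE_PER_CPU(struct example_pcpu, example_counts);

static void example_count_event(void)
{
	struct example_pcpu *p = &get_cpu_var(example_counts);

	p->nr++;
	put_cpu_var(example_counts);
}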