author     Ingo Molnar <mingo@elte.hu>            2009-07-03 08:30:03 -0500
committer  Thomas Gleixner <tglx@linutronix.de>   2009-07-29 23:30:37 +0200
commit     0a930ce98838ed0a03530fd4960eb3423c9b55bc (patch)
tree       660e743d49ecbb04a6bd08b577720b86de327f7d
parent     42cd561b099de734b16c92b7e29f418f0d62daad (diff)
sched: preempt-rt support
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-rw-r--r--  include/linux/hardirq.h  |  13
-rw-r--r--  include/linux/sched.h    |  49
-rw-r--r--  kernel/mutex.c           |   7
-rw-r--r--  kernel/sched.c           | 337
-rw-r--r--  kernel/sched_rt.c        |  54
-rw-r--r--  lib/kernel_lock.c        |   4
6 files changed, 381 insertions, 83 deletions
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 1518625411a4..70b12547cfb2 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -96,19 +96,6 @@
#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
#ifdef CONFIG_PREEMPT
-# define PREEMPT_CHECK_OFFSET 1
-#else
-# define PREEMPT_CHECK_OFFSET 0
-#endif
-
-/*
- * Check whether we were atomic before we did preempt_disable():
- * (used by the scheduler)
- */
-#define in_atomic_preempt_off() \
- ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
-
-#ifdef CONFIG_PREEMPT
# define preemptible() (preempt_count() == 0 && !irqs_disabled())
# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
#else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 69faf651b10a..2e5be662deef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -100,6 +100,17 @@ struct fs_struct;
struct bts_context;
struct perf_counter_context;
+#ifdef CONFIG_PREEMPT
+extern int kernel_preemption;
+#else
+# define kernel_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+extern int voluntary_preemption;
+#else
+# define voluntary_preemption 0
+#endif
+
#ifdef CONFIG_PREEMPT_SOFTIRQS
extern int softirq_preemption;
#else
@@ -225,6 +236,28 @@ extern struct semaphore kernel_sem;
#define set_task_state(tsk, state_value) \
set_mb((tsk)->state, (state_value))
+// #define PREEMPT_DIRECT
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern void nmi_show_all_regs(void);
+#else
+# define nmi_show_all_regs() do { } while (0)
+#endif
+
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/signal.h>
+#include <linux/securebits.h>
+#include <linux/fs_struct.h>
+#include <linux/compiler.h>
+#include <linux/completion.h>
+#include <linux/pid.h>
+#include <linux/percpu.h>
+#include <linux/topology.h>
+#include <linux/seccomp.h>
+
+struct exec_domain;
+
/*
* set_current_state() includes a barrier so that the write of current->state
* is correctly serialised wrt the caller's subsequent test of whether to
@@ -354,6 +387,11 @@ extern signed long schedule_timeout_uninterruptible(signed long timeout);
asmlinkage void __schedule(void);
asmlinkage void schedule(void);
extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
+/*
+ * This one can be called with interrupts disabled, only
+ * to be used by lowlevel arch code!
+ */
+asmlinkage void __sched __schedule(void);
struct nsproxy;
struct user_namespace;
@@ -1686,6 +1724,15 @@ extern struct pid *cad_pid;
extern void free_task(struct task_struct *tsk);
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+#ifdef CONFIG_PREEMPT_RT
+extern void __put_task_struct_cb(struct rcu_head *rhp);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+ if (atomic_dec_and_test(&t->usage))
+ call_rcu(&t->rcu, __put_task_struct_cb);
+}
+#else
extern void __put_task_struct(struct task_struct *t);
static inline void put_task_struct(struct task_struct *t)
@@ -1693,6 +1740,7 @@ static inline void put_task_struct(struct task_struct *t)
if (atomic_dec_and_test(&t->usage))
__put_task_struct(t);
}
+#endif
extern cputime_t task_utime(struct task_struct *p);
extern cputime_t task_stime(struct task_struct *p);
@@ -1910,6 +1958,7 @@ extern struct task_struct *curr_task(int cpu);
extern void set_curr_task(int cpu, struct task_struct *p);
void yield(void);
+void __yield(void);
/*
* The default (Linux) execution domain.
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 3714ee5bc638..73ad8a627e36 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -249,8 +249,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
/* didnt get the lock, go to sleep: */
spin_unlock_mutex(&lock->wait_lock, flags);
- preempt_enable_and_schedule();
+
+ local_irq_disable();
+ __preempt_enable_no_resched();
+ __schedule();
preempt_disable();
+ local_irq_enable();
+
spin_lock_mutex(&lock->wait_lock, flags);
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 7de11ead31ca..eac72fee3546 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4,6 +4,7 @@
* Kernel scheduler and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
+ * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
@@ -16,6 +17,7 @@
* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2004-10-13 Real-Time Preemption support by Ingo Molnar
* 2007-04-15 Work begun on replacing all interactivity tuning with a
* fair scheduling design by Con Kolivas.
* 2007-05-05 Load balancing (smp-nice) and other improvements
@@ -61,6 +63,7 @@
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
+#include <linux/kallsyms.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
@@ -107,6 +110,20 @@
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+#if (BITS_PER_LONG < 64)
+#define JIFFIES_TO_NS64(TIME) \
+ ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ)))
+
+#define NS64_TO_JIFFIES(TIME) \
+ ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \
+ (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME)))
+#else /* BITS_PER_LONG < 64 */
+
+#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME)
+#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME)
+
+#endif /* BITS_PER_LONG < 64 */
+
/*
* These are the 'tuning knobs' of the scheduler:
*
@@ -144,6 +161,32 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
}
#endif
+#define TASK_PREEMPTS_CURR(p, rq) \
+ ((p)->prio < (rq)->curr->prio)
+
+/*
+ * Tweaks for current
+ */
+
+#ifdef CURRENT_PTR
+struct task_struct * const ___current = &init_task;
+struct task_struct ** const current_ptr = (struct task_struct ** const)&___current;
+struct thread_info * const current_ti = &init_thread_union.thread_info;
+struct thread_info ** const current_ti_ptr = (struct thread_info ** const)&current_ti;
+
+EXPORT_SYMBOL(___current);
+EXPORT_SYMBOL(current_ti);
+
+/*
+ * The scheduler itself doesn't want 'current' to be cached
+ * during context-switches:
+ */
+# undef current
+# define current __current()
+# undef current_thread_info
+# define current_thread_info() __current_thread_info()
+#endif
+
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -497,6 +540,7 @@ struct rt_rq {
int overloaded;
struct plist_head pushable_tasks;
#endif
+ unsigned long rt_nr_uninterruptible;
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
@@ -602,6 +646,8 @@ struct rq {
*/
unsigned long nr_uninterruptible;
+ unsigned long switch_timestamp;
+ unsigned long slice_avg;
struct task_struct *curr, *idle;
unsigned long next_balance;
struct mm_struct *prev_mm;
@@ -660,6 +706,13 @@ struct rq {
/* BKL stats */
unsigned int bkl_count;
+
+ /* RT-overload stats: */
+ unsigned long rto_schedule;
+ unsigned long rto_schedule_tail;
+ unsigned long rto_wakeup;
+ unsigned long rto_pulled;
+ unsigned long rto_pushed;
#endif
};
@@ -892,11 +945,23 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
+/*
+ * We really don't want to do anything complex within switch_to()
+ * on PREEMPT_RT - this check enforces this.
+ */
+#ifdef prepare_arch_switch
+# ifdef CONFIG_PREEMPT_RT
+# error FIXME
+# else
+# define _finish_arch_switch finish_arch_switch
+# endif
+#endif
+
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
+# define _finish_arch_switch(prev) do { } while (0)
#endif
static inline int task_current(struct rq *rq, struct task_struct *p)
@@ -927,7 +992,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- atomic_spin_unlock_irq(&rq->lock);
+ atomic_spin_unlock(&rq->lock);
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -968,8 +1033,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
smp_wmb();
prev->oncpu = 0;
#endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- local_irq_enable();
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_disable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -1837,6 +1902,8 @@ static inline int normal_prio(struct task_struct *p)
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
+
+// trace_special_pid(p->pid, PRIO(p), __PRIO(prio));
return prio;
}
@@ -2447,6 +2514,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int sync, int mutex)
}
#endif
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * sync wakeups can increase wakeup latencies:
+ */
+ if (rt_task(p))
+ sync = 0;
+#endif
smp_wmb();
rq = task_rq_lock(p, &flags);
update_rq_clock(rq);
@@ -2855,7 +2929,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_state = prev->state;
- finish_arch_switch(prev);
+ _finish_arch_switch(prev);
perf_counter_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
@@ -2883,12 +2957,15 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
asmlinkage void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq = this_rq();
-
- finish_task_switch(rq, prev);
+ preempt_disable();
+ finish_task_switch(this_rq(), prev);
+ __preempt_enable_no_resched();
+ local_irq_enable();
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
/* In this case, finish_task_switch does not reenable preemption */
preempt_enable();
+#else
+ preempt_check_resched();
#endif
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
@@ -2936,6 +3013,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif
+#ifdef CURRENT_PTR
+ barrier();
+ *current_ptr = next;
+ *current_ti_ptr = next->thread_info;
+#endif
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
@@ -2982,6 +3064,11 @@ unsigned long nr_uninterruptible(void)
return sum;
}
+unsigned long nr_uninterruptible_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_uninterruptible;
+}
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -5184,6 +5271,8 @@ void scheduler_tick(void)
sched_clock_tick();
+ BUG_ON(!irqs_disabled());
+
atomic_spin_lock(&rq->lock);
update_rq_clock(rq);
update_cpu_load(rq);
@@ -5277,8 +5366,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
{
struct pt_regs *regs = get_irq_regs();
- printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
- prev->comm, prev->pid, preempt_count());
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n",
+ prev->comm, preempt_count(), prev->pid, smp_processor_id());
debug_show_held_locks(prev);
print_modules();
@@ -5296,12 +5385,14 @@ static noinline void __schedule_bug(struct task_struct *prev)
*/
static inline void schedule_debug(struct task_struct *prev)
{
+// WARN_ON(system_state == SYSTEM_BOOTING);
+
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
- if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+ if (unlikely(in_atomic() && !prev->exit_state))
__schedule_bug(prev);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -5386,10 +5477,11 @@ asmlinkage void __sched __schedule(void)
switch_count = &prev->nivcsw;
release_kernel_lock(prev);
-need_resched_nonpreemptible:
schedule_debug(prev);
+ preempt_disable();
+
if (sched_feat(HRTICK))
hrtick_clear(rq);
@@ -5401,11 +5493,16 @@ need_resched_nonpreemptible:
!(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev)))
prev->state = TASK_RUNNING;
- else
+ else {
+ touch_softlockup_watchdog();
deactivate_task(rq, prev, 1);
+ }
switch_count = &prev->nvcsw;
}
+ if (preempt_count() & PREEMPT_ACTIVE)
+ sub_preempt_count(PREEMPT_ACTIVE);
+
#ifdef CONFIG_SMP
if (prev->sched_class->pre_schedule)
prev->sched_class->pre_schedule(rq, prev);
@@ -5432,19 +5529,22 @@ need_resched_nonpreemptible:
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- } else
- atomic_spin_unlock_irq(&rq->lock);
+ __preempt_enable_no_resched();
+ } else {
+ __preempt_enable_no_resched();
+ atomic_spin_unlock(&rq->lock);
+ }
- if (unlikely(reacquire_kernel_lock(current) < 0))
- goto need_resched_nonpreemptible;
+ reacquire_kernel_lock(current);
}
asmlinkage void __sched schedule(void)
{
need_resched:
- preempt_disable();
+ local_irq_disable();
__schedule();
- __preempt_enable_no_resched();
+ local_irq_enable();
+
if (need_resched())
goto need_resched;
}
@@ -5512,6 +5612,35 @@ out:
#endif
#ifdef CONFIG_PREEMPT
+
+/*
+ * Global flag to turn preemption off on a CONFIG_PREEMPT kernel:
+ */
+int kernel_preemption = 1;
+
+static int __init preempt_setup (char *str)
+{
+ if (!strncmp(str, "off", 3)) {
+ if (kernel_preemption) {
+ printk(KERN_INFO "turning off kernel preemption!\n");
+ kernel_preemption = 0;
+ }
+ return 1;
+ }
+ if (!strncmp(str, "on", 2)) {
+ if (!kernel_preemption) {
+ printk(KERN_INFO "turning on kernel preemption!\n");
+ kernel_preemption = 1;
+ }
+ return 1;
+ }
+ get_option(&str, &kernel_preemption);
+
+ return 1;
+}
+
+__setup("preempt=", preempt_setup);
+
/*
* this is the entry point to schedule() from in-kernel preemption
* off of preempt_enable. Kernel preemptions off return from interrupt
@@ -5523,6 +5652,8 @@ asmlinkage void __sched preempt_schedule(void)
struct task_struct *task = current;
int saved_lock_depth;
+ if (!kernel_preemption)
+ return;
/*
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
@@ -5531,6 +5662,7 @@ asmlinkage void __sched preempt_schedule(void)
return;
do {
+ local_irq_disable();
add_preempt_count(PREEMPT_ACTIVE);
/*
@@ -5540,9 +5672,9 @@ asmlinkage void __sched preempt_schedule(void)
*/
saved_lock_depth = task->lock_depth;
task->lock_depth = -1;
- schedule();
+ __schedule();
task->lock_depth = saved_lock_depth;
- sub_preempt_count(PREEMPT_ACTIVE);
+ local_irq_enable();
/*
* Check again in case we missed a preemption opportunity
@@ -5554,10 +5686,10 @@ asmlinkage void __sched preempt_schedule(void)
EXPORT_SYMBOL(preempt_schedule);
/*
- * this is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
+ * this is the entry point for the IRQ return path. Called with
+ * interrupts disabled. To avoid infinite irq-entry recursion problems
+ * with fast-paced IRQ sources we do all of this carefully to never
+ * enable interrupts again.
*/
asmlinkage void __sched preempt_schedule_irq(void)
{
@@ -5565,10 +5697,17 @@ asmlinkage void __sched preempt_schedule_irq(void)
struct task_struct *task = current;
int saved_lock_depth;
- /* Catch callers which need to be fixed */
- WARN_ON_ONCE(ti->preempt_count || !irqs_disabled());
+ if (!kernel_preemption)
+ return;
+ /*
+ * If there is a non-zero preempt_count then just return.
+ * (interrupts are disabled)
+ */
+ if (unlikely(ti->preempt_count))
+ return;
do {
+ local_irq_disable();
add_preempt_count(PREEMPT_ACTIVE);
/*
@@ -5578,11 +5717,9 @@ asmlinkage void __sched preempt_schedule_irq(void)
*/
saved_lock_depth = task->lock_depth;
task->lock_depth = -1;
- local_irq_enable();
- schedule();
+ __schedule();
local_irq_disable();
task->lock_depth = saved_lock_depth;
- sub_preempt_count(PREEMPT_ACTIVE);
/*
* Check again in case we missed a preemption opportunity
@@ -6002,6 +6139,7 @@ void task_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio, running);
}
+
task_rq_unlock(rq, &flags);
}
@@ -6642,6 +6780,7 @@ SYSCALL_DEFINE0(sched_yield)
__release(rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
_raw_spin_unlock(&rq->lock);
+ local_irq_enable();
preempt_enable_and_schedule();
@@ -6653,9 +6792,40 @@ static inline int should_resched(void)
return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
}
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
+void __might_sleep(char *file, int line)
+{
+#ifdef in_atomic
+ static unsigned long prev_jiffy; /* ratelimiting */
+
+ if ((!in_atomic() && !irqs_disabled()) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ dump_stack();
+#endif
+}
+EXPORT_SYMBOL(__might_sleep);
+#endif
+
static void __cond_resched(void)
{
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
__might_sleep(__FILE__, __LINE__);
#endif
/*
@@ -6664,10 +6834,11 @@ static void __cond_resched(void)
* cond_resched() call.
*/
do {
+ local_irq_disable();
add_preempt_count(PREEMPT_ACTIVE);
- schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
+ __schedule();
} while (need_resched());
+ local_irq_enable();
}
int __sched _cond_resched(void)
@@ -6711,7 +6882,11 @@ EXPORT_SYMBOL(cond_resched_lock);
*/
int __sched cond_resched_softirq(void)
{
+#ifndef CONFIG_PREEMPT_SOFTIRQS
WARN_ON_ONCE(!in_softirq());
+ if (!in_softirq())
+ return 0;
+#endif
if (should_resched()) {
local_bh_enable();
@@ -6742,17 +6917,56 @@ int __sched cond_resched_softirq_context(void)
}
EXPORT_SYMBOL(cond_resched_softirq_context);
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+int voluntary_preemption = 1;
+EXPORT_SYMBOL(voluntary_preemption);
+
+static int __init voluntary_preempt_setup (char *str)
+{
+ if (!strncmp(str, "off", 3))
+ voluntary_preemption = 0;
+ else
+ get_option(&str, &voluntary_preemption);
+ if (!voluntary_preemption)
+ printk("turning off voluntary preemption!\n");
+
+ return 1;
+}
+
+__setup("voluntary-preempt=", voluntary_preempt_setup);
+
+#endif
+
/**
* yield - yield the current processor to other threads.
*
* This is a shortcut for kernel-space yielding - it marks the
* thread runnable and calls sys_sched_yield().
*/
-void __sched yield(void)
+void __sched __yield(void)
{
set_current_state(TASK_RUNNING);
sys_sched_yield();
}
+
+void __sched yield(void)
+{
+ static int once = 1;
+
+ /*
+ * it's a bug to rely on yield() with RT priorities. We print
+ * the first occurrence after bootup ... this will still give
+ * us an idea about the scope of the problem, without spamming
+ * the syslog:
+ */
+ if (once && rt_task(current)) {
+ once = 0;
+ printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n",
+ current->comm, current->pid);
+ dump_stack();
+ }
+ __yield();
+}
EXPORT_SYMBOL(yield);
/*
@@ -6926,6 +7140,7 @@ void sched_show_task(struct task_struct *p)
void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
+ int do_unlock = 1;
#if BITS_PER_LONG == 32
printk(KERN_INFO
@@ -6934,7 +7149,16 @@ void show_state_filter(unsigned long state_filter)
printk(KERN_INFO
" task PC stack pid father\n");
#endif
+#ifdef CONFIG_PREEMPT_RT
+ if (!read_trylock(&tasklist_lock)) {
+ printk("hm, tasklist_lock write-locked.\n");
+ printk("ignoring ...\n");
+ do_unlock = 0;
+ }
+#else
read_lock(&tasklist_lock);
+#endif
+
do_each_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
@@ -6950,7 +7174,8 @@ void show_state_filter(unsigned long state_filter)
#ifdef CONFIG_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
- read_unlock(&tasklist_lock);
+ if (do_unlock)
+ read_unlock(&tasklist_lock);
/*
* Only show locks if all tasks are dumped:
*/
@@ -7122,11 +7347,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
struct rq *rq_dest, *rq_src;
+ unsigned long flags;
int ret = 0, on_rq;
if (unlikely(!cpu_active(dest_cpu)))
return ret;
+ /*
+ * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock)
+ * disabling interrupts - which on PREEMPT_RT does not do:
+ */
+ local_irq_save(flags);
+
rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);
@@ -7151,6 +7383,8 @@ done:
ret = 1;
fail:
double_rq_unlock(rq_src, rq_dest);
+ local_irq_restore(flags);
+
return ret;
}
@@ -9470,6 +9704,9 @@ void __init sched_init(void)
atomic_inc(&init_mm.mm_count);
enter_lazy_tlb(&init_mm, current);
+#ifdef CONFIG_PREEMPT_RT
+ printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n");
+#endif
/*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
@@ -9500,36 +9737,6 @@ void __init sched_init(void)
scheduler_running = 1;
}
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
-{
-#ifdef in_atomic
- static unsigned long prev_jiffy; /* ratelimiting */
-
- if ((!in_atomic() && !irqs_disabled()) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
- return;
- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
- return;
- prev_jiffy = jiffies;
-
- printk(KERN_ERR
- "BUG: sleeping function called from invalid context at %s:%d\n",
- file, line);
- printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(),
- current->pid, current->comm);
-
- debug_show_held_locks(current);
- if (irqs_disabled())
- print_irqtrace_events(current);
- dump_stack();
-#endif
-}
-EXPORT_SYMBOL(__might_sleep);
-#endif
-
#ifdef CONFIG_MAGIC_SYSRQ
static void normalize_task(struct rq *rq, struct task_struct *p)
{
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 54779ebe48cc..5d6bb327121d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -860,6 +860,48 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
}
}
+static inline void incr_rt_nr_uninterruptible(struct task_struct *p,
+ struct rq *rq)
+{
+ rq->rt.rt_nr_uninterruptible++;
+}
+
+static inline void decr_rt_nr_uninterruptible(struct task_struct *p,
+ struct rq *rq)
+{
+ rq->rt.rt_nr_uninterruptible--;
+}
+
+unsigned long rt_nr_running(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->rt.rt_nr_running;
+
+ return sum;
+}
+
+unsigned long rt_nr_running_cpu(int cpu)
+{
+ return cpu_rq(cpu)->rt.rt_nr_running;
+}
+
+unsigned long rt_nr_uninterruptible(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->rt.rt_nr_uninterruptible;
+
+ return sum;
+}
+
+unsigned long rt_nr_uninterruptible_cpu(int cpu)
+{
+ return cpu_rq(cpu)->rt.rt_nr_uninterruptible;
+}
+
/*
* Adding/removing a task to/from a priority array:
*/
@@ -872,6 +914,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
enqueue_rt_entity(rt_se);
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ decr_rt_nr_uninterruptible(p, rq);
+
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -883,6 +928,10 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
+
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ incr_rt_nr_uninterruptible(p, rq);
+
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
@@ -1462,8 +1511,10 @@ static int pull_rt_task(struct rq *this_rq)
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
+ if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) {
pull_rt_task(rq);
+ schedstat_inc(rq, rto_schedule);
+ }
}
/*
@@ -1545,7 +1596,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
*/
if (weight > 1)
enqueue_pushable_task(rq, p);
-
}
if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 7e2ad7bd4223..54625bec6fb9 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -41,15 +41,15 @@ int __lockfunc __reacquire_kernel_lock(void)
struct task_struct *task = current;
int saved_lock_depth = task->lock_depth;
+ local_irq_enable();
BUG_ON(saved_lock_depth < 0);
task->lock_depth = -1;
- __preempt_enable_no_resched();
down(&kernel_sem);
- preempt_disable();
task->lock_depth = saved_lock_depth;
+ local_irq_enable();
return 0;
}