summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/i386/Kconfig9
-rw-r--r--arch/i386/kernel/apic.c3
-rw-r--r--arch/i386/kernel/io_apic.c2
-rw-r--r--arch/i386/kernel/irq.c67
-rw-r--r--arch/i386/kernel/process.c39
-rw-r--r--arch/i386/kernel/smp.c24
-rw-r--r--arch/i386/kernel/smpboot.c98
-rw-r--r--arch/i386/kernel/traps.c8
-rw-r--r--arch/ia64/kernel/smpboot.c1
-rw-r--r--arch/ppc64/kernel/pSeries_smp.c5
-rw-r--r--arch/s390/kernel/smp.c4
-rw-r--r--include/asm-i386/cpu.h2
-rw-r--r--include/asm-i386/irq.h4
-rw-r--r--include/asm-i386/smp.h3
-rw-r--r--kernel/cpu.c14
15 files changed, 243 insertions, 40 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index d4ae5f9ceae6..b4cd11e58451 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1250,6 +1250,15 @@ config SCx200
This support is also available as a module. If compiled as a
module, it will be called scx200.
+config HOTPLUG_CPU
+ bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
+ depends on SMP && HOTPLUG && EXPERIMENTAL
+ ---help---
+ Say Y here to experiment with turning CPUs off and on. CPUs
+ can be controlled through /sys/devices/system/cpu.
+
+ Say N.
+
source "drivers/pcmcia/Kconfig"
source "drivers/pci/hotplug/Kconfig"
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 8d993fa71754..a28a088f3e75 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -26,6 +26,7 @@
#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
+#include <linux/cpu.h>
#include <asm/atomic.h>
#include <asm/smp.h>
@@ -1048,7 +1049,7 @@ void __init setup_secondary_APIC_clock(void)
setup_APIC_timer(calibration_result);
}
-void __init disable_APIC_timer(void)
+void __devinit disable_APIC_timer(void)
{
if (using_apic_timer) {
unsigned long v;
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 08540bc4ba3e..3c2b3bdfc807 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -576,9 +576,11 @@ static int balanced_irq(void *unused)
try_to_freeze(PF_FREEZE);
if (time_after(jiffies,
prev_balance_time+balanced_irq_interval)) {
+ preempt_disable();
do_irq_balance();
prev_balance_time = jiffies;
time_remaining = balanced_irq_interval;
+ preempt_enable();
}
}
return 0;
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 73945a3c53c4..af115004aec5 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -15,6 +15,9 @@
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp;
EXPORT_PER_CPU_SYMBOL(irq_stat);
@@ -210,9 +213,8 @@ int show_interrupts(struct seq_file *p, void *v)
if (i == 0) {
seq_printf(p, " ");
- for (j=0; j<NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "CPU%d ",j);
+ for_each_cpu(j)
+ seq_printf(p, "CPU%d ",j);
seq_putc(p, '\n');
}
@@ -225,9 +227,8 @@ int show_interrupts(struct seq_file *p, void *v)
#ifndef CONFIG_SMP
seq_printf(p, "%10u ", kstat_irqs(i));
#else
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ for_each_cpu(j)
+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
#endif
seq_printf(p, " %14s", irq_desc[i].handler->typename);
seq_printf(p, " %s", action->name);
@@ -240,16 +241,14 @@ skip:
spin_unlock_irqrestore(&irq_desc[i].lock, flags);
} else if (i == NR_IRQS) {
seq_printf(p, "NMI: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", nmi_count(j));
+ for_each_cpu(j)
+ seq_printf(p, "%10u ", nmi_count(j));
seq_putc(p, '\n');
#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).apic_timer_irqs);
+ for_each_cpu(j)
+ seq_printf(p, "%10u ",
+ per_cpu(irq_stat,j).apic_timer_irqs);
seq_putc(p, '\n');
#endif
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
@@ -259,3 +258,45 @@ skip:
}
return 0;
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <mach_apic.h>
+
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+ static int warned;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ if (irq == 2)
+ continue;
+
+ cpus_and(mask, irq_affinity[irq], map);
+ if (any_online_cpu(mask) == NR_CPUS) {
+ printk("Breaking affinity for irq %i\n", irq);
+ mask = map;
+ }
+ if (irq_desc[irq].handler->set_affinity)
+ irq_desc[irq].handler->set_affinity(irq, mask);
+ else if (irq_desc[irq].action && !(warned++))
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+#if 0
+ barrier();
+ /* Ingo Molnar says: "after the IO-APIC masks have been redirected
+ [note the nop - the interrupt-enable boundary on x86 is two
+ instructions from sti] - to flush out pending hardirqs and
+ IPIs. After this point nothing is supposed to reach this CPU." */
+ __asm__ __volatile__("sti; nop; cli");
+ barrier();
+#else
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+#endif
+}
+#endif
+
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index aea2ce1145df..c1b11e8df60b 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -13,6 +13,7 @@
#include <stdarg.h>
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
@@ -55,6 +56,9 @@
#include <linux/irq.h>
#include <linux/err.h>
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
static int hlt_counter;
@@ -143,14 +147,44 @@ static void poll_idle (void)
}
}
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ /* We shouldn't have to disable interrupts while dead, but
+ * some interrupts just don't seem to go away, and this makes
+ * it "work" for testing purposes. */
+ /* Death loop */
+ while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
+ cpu_relax();
+
+ local_irq_disable();
+ __flush_tlb_all();
+ cpu_set(smp_processor_id(), cpu_online_map);
+ enable_APIC_timer();
+ local_irq_enable();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
/*
* The idle thread. There's no useful work to be
* done, so just try to conserve power and have a
* low exit latency (ie sit in a loop waiting for
* somebody to say that they'd like to reschedule)
*/
-void cpu_idle (void)
+void cpu_idle(void)
{
+ int cpu = raw_smp_processor_id();
+
/* endless idle loop with no priority at all */
while (1) {
while (!need_resched()) {
@@ -165,6 +199,9 @@ void cpu_idle (void)
if (!idle)
idle = default_idle;
+ if (cpu_is_offline(cpu))
+ play_dead();
+
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
idle();
}
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 68be7d0c7238..35f521612b20 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -19,6 +19,7 @@
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
+#include <linux/cpu.h>
#include <linux/module.h>
#include <asm/mtrr.h>
@@ -164,7 +165,7 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
unsigned long flags;
local_irq_save(flags);
-
+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
/*
* Wait for idle.
*/
@@ -346,21 +347,21 @@ out:
static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
unsigned long va)
{
- cpumask_t tmp;
/*
* A couple of (to be removed) sanity checks:
*
- * - we do not send IPIs to not-yet booted CPUs.
* - current CPU must not be in mask
* - mask must exist :)
*/
BUG_ON(cpus_empty(cpumask));
-
- cpus_and(tmp, cpumask, cpu_online_map);
- BUG_ON(!cpus_equal(cpumask, tmp));
BUG_ON(cpu_isset(smp_processor_id(), cpumask));
BUG_ON(!mm);
+ /* If a CPU which we ran on has gone down, OK. */
+ cpus_and(cpumask, cpumask, cpu_online_map);
+ if (cpus_empty(cpumask))
+ return;
+
/*
* i'm not happy about this global shared spinlock in the
* MM hot path, but we'll see how contended it is.
@@ -476,6 +477,7 @@ void flush_tlb_all(void)
*/
void smp_send_reschedule(int cpu)
{
+ WARN_ON(cpu_is_offline(cpu));
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
@@ -516,10 +518,15 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
*/
{
struct call_data_struct data;
- int cpus = num_online_cpus()-1;
+ int cpus;
- if (!cpus)
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+ cpus = num_online_cpus() - 1;
+ if (!cpus) {
+ spin_unlock(&call_lock);
return 0;
+ }
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
@@ -531,7 +538,6 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
if (wait)
atomic_set(&data.finished, 0);
- spin_lock(&call_lock);
call_data = &data;
mb();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index c20d96d5c15c..ad74a46e9ef0 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -44,6 +44,9 @@
#include <linux/smp_lock.h>
#include <linux/irq.h>
#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/mc146818rtc.h>
@@ -96,6 +99,9 @@ static int trampoline_exec;
static void map_cpu_to_logical_apicid(void);
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
@@ -1119,6 +1125,9 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
+ smp_commenced_mask = cpumask_of_cpu(0);
+ cpu_callin_map = cpumask_of_cpu(0);
+ mb();
smp_boot_cpus(max_cpus);
}
@@ -1128,20 +1137,99 @@ void __devinit smp_prepare_boot_cpu(void)
cpu_set(smp_processor_id(), cpu_callout_map);
}
-int __devinit __cpu_up(unsigned int cpu)
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* must be called with the cpucontrol mutex held */
+static int __devinit cpu_enable(unsigned int cpu)
{
- /* This only works at boot for x86. See "rewrite" above. */
- if (cpu_isset(cpu, smp_commenced_mask)) {
- local_irq_enable();
- return -ENOSYS;
+ /* get the target out of its holding state */
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ wmb();
+
+ /* wait for the processor to ack it. timeout? */
+ while (!cpu_online(cpu))
+ cpu_relax();
+
+ fixup_irqs(cpu_online_map);
+ /* counter the disable in fixup_irqs() */
+ local_irq_enable();
+ return 0;
+}
+
+int __cpu_disable(void)
+{
+ cpumask_t map = cpu_online_map;
+ int cpu = smp_processor_id();
+
+ /*
+ * Perhaps use cpufreq to drop frequency, but that could go
+ * into generic code.
+ *
+ * We won't take down the boot processor on i386 due to some
+ * interrupts only being able to be serviced by the BSP.
+ * Especially so if we're not using an IOAPIC -zwane
+ */
+ if (cpu == 0)
+ return -EBUSY;
+
+ /* We enable the timer again on the exit path of the death loop */
+ disable_APIC_timer();
+ /* Allow any queued timer interrupts to get serviced */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+
+ cpu_clear(cpu, map);
+ fixup_irqs(map);
+ /* It's now safe to remove this processor from the online map */
+ cpu_clear(cpu, cpu_online_map);
+ return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We don't do anything here: idle task is faking death itself. */
+ unsigned int i;
+
+ for (i = 0; i < 10; i++) {
+ /* They ack this in play_dead by setting CPU_DEAD */
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD)
+ return;
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(HZ/10);
}
+ printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+void __cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable */
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __devinit __cpu_up(unsigned int cpu)
+{
/* In case one didn't come up */
if (!cpu_isset(cpu, cpu_callin_map)) {
+ printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
local_irq_enable();
return -EIO;
}
+#ifdef CONFIG_HOTPLUG_CPU
+ /* Already up, and in cpu_quiescent now? */
+ if (cpu_isset(cpu, smp_commenced_mask)) {
+ cpu_enable(cpu);
+ return 0;
+ }
+#endif
+
local_irq_enable();
/* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index e4d4e2162c7a..207ea8ba7169 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -625,6 +625,14 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
nmi_enter();
cpu = smp_processor_id();
+
+#ifdef CONFIG_HOTPLUG_CPU
+ if (!cpu_online(cpu)) {
+ nmi_exit();
+ return;
+ }
+#endif
+
++nmi_count(cpu);
if (!nmi_callback(regs, cpu))
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 3865f088ffa2..a888ddc10f7d 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -688,6 +688,7 @@ int __cpu_disable(void)
return -EBUSY;
remove_siblinginfo(cpu);
+ cpu_clear(cpu, cpu_online_map);
fixup_irqs();
local_flush_tlb_all();
cpu_clear(cpu, cpu_callin_map);
diff --git a/arch/ppc64/kernel/pSeries_smp.c b/arch/ppc64/kernel/pSeries_smp.c
index 30154140f7e2..62c55a123560 100644
--- a/arch/ppc64/kernel/pSeries_smp.c
+++ b/arch/ppc64/kernel/pSeries_smp.c
@@ -93,10 +93,13 @@ static int query_cpu_stopped(unsigned int pcpu)
int pSeries_cpu_disable(void)
{
+ int cpu = smp_processor_id();
+
+ cpu_clear(cpu, cpu_online_map);
systemcfg->processorCount--;
/*fix boot_cpuid here*/
- if (smp_processor_id() == boot_cpuid)
+ if (cpu == boot_cpuid)
boot_cpuid = any_online_cpu(cpu_online_map);
/* FIXME: abstract this to not be platform specific later on */
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index fdfcf0488b49..93c71fef99dc 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -679,12 +679,14 @@ __cpu_disable(void)
{
unsigned long flags;
ec_creg_mask_parms cr_parms;
+ int cpu = smp_processor_id();
spin_lock_irqsave(&smp_reserve_lock, flags);
- if (smp_cpu_reserved[smp_processor_id()] != 0) {
+ if (smp_cpu_reserved[cpu] != 0) {
spin_unlock_irqrestore(&smp_reserve_lock, flags);
return -EBUSY;
}
+ cpu_clear(cpu, cpu_online_map);
#ifdef CONFIG_PFAULT
/* Disable pfault pseudo page faults on this cpu. */
diff --git a/include/asm-i386/cpu.h b/include/asm-i386/cpu.h
index 002740b21951..e7252c216ca8 100644
--- a/include/asm-i386/cpu.h
+++ b/include/asm-i386/cpu.h
@@ -5,6 +5,7 @@
#include <linux/cpu.h>
#include <linux/topology.h>
#include <linux/nodemask.h>
+#include <linux/percpu.h>
#include <asm/node.h>
@@ -16,4 +17,5 @@ extern int arch_register_cpu(int num);
extern void arch_unregister_cpu(int);
#endif
+DECLARE_PER_CPU(int, cpu_state);
#endif /* _ASM_I386_CPU_H_ */
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 05b9e61b0a72..e2d8bf23ad70 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -38,4 +38,8 @@ extern void release_vm86_irqs(struct task_struct *);
extern int irqbalance_disable(char *str);
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+extern void fixup_irqs(cpumask_t map);
+#endif
+
#endif /* _ASM_IRQ_H */
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 55ef31f66bbe..507f2fd39a6a 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -83,6 +83,9 @@ static __inline int logical_smp_processor_id(void)
}
#endif
+
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
#endif /* !__ASSEMBLY__ */
#define NO_PROC_ID 0xFF /* No processor magic marker */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 628f4ccda127..53d8263ae12e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused)
{
int err;
- /* Take offline: makes arch_cpu_down somewhat easier. */
- cpu_clear(smp_processor_id(), cpu_online_map);
-
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
- cpu_set(smp_processor_id(), cpu_online_map);
- else
- /* Force idle task to run as soon as we yield: it should
- immediately notice cpu is offline and die quickly. */
- sched_idle_next();
+ return err;
- return err;
+ /* Force idle task to run as soon as we yield: it should
+ immediately notice cpu is offline and die quickly. */
+ sched_idle_next();
+ return 0;
}
int cpu_down(unsigned int cpu)