From 0016a4cf5582415849fafbf9f019dd9530824789 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 15 Jun 2010 14:48:58 +1000
Subject: powerpc: Emulate most Book I instructions in emulate_step()

This extends the emulate_step() function to handle a large proportion
of the Book I instructions implemented on current 64-bit server
processors.  The aim is to handle all the load and store instructions
used in the kernel, plus all of the instructions that appear between
l[wd]arx and st[wd]cx., so this handles the Altivec/VMX lvx and stvx
and the VSX lxv2dx and stxv2dx instructions (implemented in POWER7).

The new code can emulate user mode instructions, and checks the
effective address for a load or store if the saved state is for
user mode.  It doesn't handle little-endian mode at present.

For floating-point, Altivec/VMX and VSX instructions, it checks
that the saved MSR has the enable bit for the relevant facility
set, and if so, assumes that the FP/VMX/VSX registers contain
valid state, and does loads or stores directly to/from the
FP/VMX/VSX registers, using assembly helpers in ldstfp.S.

Instructions supported now include:
* Loads and stores, including some but not all VMX and VSX instructions,
  and lmw/stmw
* Atomic loads and stores (l[dw]arx, st[dw]cx.)
* Arithmetic instructions (add, subtract, multiply, divide, etc.)
* Compare instructions
* Rotate and mask instructions
* Shift instructions
* Logical instructions (and, or, xor, etc.)
* Condition register logical instructions
* mtcrf, cntlz[wd], exts[bhw]
* isync, sync, lwsync, ptesync, eieio
* Cache operations (dcbf, dcbst, dcbt, dcbtst)

The overflow-checking arithmetic instructions are not included, but
they appear not to be ever used in C code.

This uses decimal values for the minor opcodes in the switch statements
because that is what appears in the Power ISA specification, thus it is
easier to check that they are correct if they are in decimal.

If this is used to single-step an instruction where a data breakpoint
interrupt occurred, then there is the possibility that the instruction
is a lwarx or ldarx.  In that case we have to be careful not to lose the
reservation until we get to the matching st[wd]cx., or we'll never make
forward progress.  One alternative is to try to arrange that we can
return from interrupts and handle data breakpoint interrupts without
losing the reservation, which means not using any spinlocks, mutexes,
or atomic ops (including bitops).  That seems rather fragile.  The
other alternative is to emulate the larx/stcx and all the instructions
in between.  This is why this commit adds support for a wide range
of integer instructions.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/asm-compat.h | 2 ++
 arch/powerpc/include/asm/ppc-opcode.h | 7 +++++++
 2 files changed, 9 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h
index 2048a6aeea91..decad950f11a 100644
--- a/arch/powerpc/include/asm/asm-compat.h
+++ b/arch/powerpc/include/asm/asm-compat.h
@@ -30,6 +30,7 @@
 #define PPC_STLCX	stringify_in_c(stdcx.)
 #define PPC_CNTLZL	stringify_in_c(cntlzd)
 #define PPC_LR_STKOFF	16
+#define PPC_MIN_STKFRM	112
 
 /* Move to CR, single-entry optimized version. Only available
  * on POWER4 and later.
@@ -55,6 +56,7 @@
 #define PPC_CNTLZL	stringify_in_c(cntlzw)
 #define PPC_MTOCRF	stringify_in_c(mtcrf)
 #define PPC_LR_STKOFF	4
+#define PPC_MIN_STKFRM	16
 
 #endif
 
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index d553bbeb726c..43adc8b819ed 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -52,13 +52,17 @@
 #define PPC_INST_WAIT			0x7c00007c
 #define PPC_INST_TLBIVAX		0x7c000624
 #define PPC_INST_TLBSRX_DOT		0x7c0006a5
+#define PPC_INST_XXLOR			0xf0000510
 
 /* macros to insert fields into opcodes */
 #define __PPC_RA(a)	(((a) & 0x1f) << 16)
 #define __PPC_RB(b)	(((b) & 0x1f) << 11)
 #define __PPC_RS(s)	(((s) & 0x1f) << 21)
 #define __PPC_RT(s)	__PPC_RS(s)
+#define __PPC_XA(a)	((((a) & 0x1f) << 16) | (((a) & 0x20) >> 3))
+#define __PPC_XB(b)	((((b) & 0x1f) << 11) | (((b) & 0x20) >> 4))
 #define __PPC_XS(s)	((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5))
+#define __PPC_XT(s)	__PPC_XS(s)
 #define __PPC_T_TLB(t)	(((t) & 0x3) << 21)
 #define __PPC_WC(w)	(((w) & 0x3) << 21)
 /*
@@ -106,9 +110,12 @@
  * the 128 bit load store instructions based on that.
  */
 #define VSX_XX1(s, a, b)	(__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b))
+#define VSX_XX3(t, a, b)	(__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b))
 #define STXVD2X(s, a, b)	stringify_in_c(.long PPC_INST_STXVD2X | \
 					       VSX_XX1((s), (a), (b)))
 #define LXVD2X(s, a, b)		stringify_in_c(.long PPC_INST_LXVD2X | \
 					       VSX_XX1((s), (a), (b)))
+#define XXLOR(t, a, b)		stringify_in_c(.long PPC_INST_XXLOR | \
+					       VSX_XX3((t), (a), (b)))
 
 #endif /* _ASM_POWERPC_PPC_OPCODE_H */
-- 
cgit v1.2.3


From 5aae8a53708025d4e718f0d2e7c2f766779ddc71 Mon Sep 17 00:00:00 2001
From: "K.Prasad" <prasad@linux.vnet.ibm.com>
Date: Tue, 15 Jun 2010 11:35:19 +0530
Subject: powerpc, hw_breakpoints: Implement hw_breakpoints for 64-bit server
 processors

Implement perf-events based hw-breakpoint interfaces for PowerPC
64-bit server (Book III S) processors.  This allows access to a
given location to be used as an event that can be counted or
profiled by the perf_events subsystem.

This is done using the DABR (data breakpoint register), which can
also be used for process debugging via ptrace.  When perf_event
hw_breakpoint support is configured in, the perf_event subsystem
manages the DABR and arbitrates access to it, and ptrace then
creates a perf_event when it is requested to set a data breakpoint.

[Adopted suggestions from Paul Mackerras <paulus@samba.org> to
- emulate_step() all system-wide breakpoints and single-step only the
  per-task breakpoints
- perform arch-specific cleanup before unregistration through
  arch_unregister_hw_breakpoint()
]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/Kconfig                     |   1 +
 arch/powerpc/include/asm/cputable.h      |   4 +
 arch/powerpc/include/asm/hw_breakpoint.h |  73 +++++++
 arch/powerpc/include/asm/processor.h     |   8 +
 arch/powerpc/kernel/Makefile             |   1 +
 arch/powerpc/kernel/exceptions-64s.S     |   1 +
 arch/powerpc/kernel/hw_breakpoint.c      | 325 +++++++++++++++++++++++++++++++
 arch/powerpc/kernel/machine_kexec_64.c   |   3 +
 arch/powerpc/kernel/process.c            |  14 ++
 arch/powerpc/kernel/ptrace.c             |  64 ++++++
 arch/powerpc/lib/Makefile                |   1 +
 11 files changed, 495 insertions(+)
 create mode 100644 arch/powerpc/include/asm/hw_breakpoint.h
 create mode 100644 arch/powerpc/kernel/hw_breakpoint.c

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 328774bd41ee..7949afc861dd 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,6 +141,7 @@ config PPC
 	select GENERIC_ATOMIC64 if PPC32
 	select HAVE_PERF_EVENTS
 	select HAVE_REGS_AND_STACK_ACCESS_API
+	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
 
 config EARLY_PRINTK
 	bool
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index b0b21134f61a..5e2e2cfcc81b 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -517,6 +517,10 @@ static inline int cpu_has_feature(unsigned long feature)
 		& feature);
 }
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+#define HBP_NUM 1
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..b111713b593e
--- /dev/null
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -0,0 +1,73 @@
+/*
+ * PowerPC BookIII S hardware breakpoint definitions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2010, IBM Corporation.
+ * Author: K.Prasad <prasad@linux.vnet.ibm.com>
+ *
+ */
+
+#ifndef _PPC_BOOK3S_64_HW_BREAKPOINT_H
+#define _PPC_BOOK3S_64_HW_BREAKPOINT_H
+
+#ifdef	__KERNEL__
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+
+struct arch_hw_breakpoint {
+	u8		len; /* length of the target data symbol */
+	int		type;
+	unsigned long	address;
+};
+
+#include <linux/kdebug.h>
+#include <asm/reg.h>
+#include <asm/system.h>
+
+static inline int hw_breakpoint_slots(int type)
+{
+	return HBP_NUM;
+}
+struct perf_event;
+struct pmu;
+struct perf_sample_data;
+
+#define HW_BREAKPOINT_ALIGN 0x7
+/* Maximum permissible length of any HW Breakpoint */
+#define HW_BREAKPOINT_LEN 0x8
+
+extern int arch_bp_generic_fields(int type, int *gen_bp_type);
+extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+						unsigned long val, void *data);
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk);
+
+extern struct pmu perf_ops_bp;
+extern void ptrace_triggered(struct perf_event *bp, int nmi,
+			struct perf_sample_data *data, struct pt_regs *regs);
+static inline void hw_breakpoint_disable(void)
+{
+	set_dabr(0);
+}
+
+#else	/* CONFIG_HAVE_HW_BREAKPOINT */
+static inline void hw_breakpoint_disable(void) { }
+#endif	/* CONFIG_HAVE_HW_BREAKPOINT */
+#endif	/* __KERNEL__ */
+#endif	/* _PPC_BOOK3S_64_HW_BREAKPOINT_H */
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 7492fe8ad6e4..19c05b0f74be 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -209,6 +209,14 @@ struct thread_struct {
 #ifdef CONFIG_PPC64
 	unsigned long	start_tb;	/* Start purr when proc switched in */
 	unsigned long	accum_tb;	/* Total accumilated purr for process */
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	struct perf_event *ptrace_bps[HBP_NUM];
+	/*
+	 * Helps identify source of single-step exception and subsequent
+	 * hw-breakpoint enablement
+	 */
+	struct perf_event *last_hit_ubp;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
 #endif
 	unsigned long	dabr;		/* Data address breakpoint register */
 #ifdef CONFIG_ALTIVEC
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 58d0572de6f9..01006040f59e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -34,6 +34,7 @@ obj-y				+= vdso32/
 obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
 				   paca.o nvram_64.o firmware.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj64-$(CONFIG_RELOCATABLE)	+= reloc_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 3e423fbad6bc..f53029a01554 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -828,6 +828,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES)
 
 /* We have a data breakpoint exception - handle it */
 handle_dabr_fault:
+	bl	.save_nvgprs
 	ld      r4,_DAR(r1)
 	ld      r5,_DSISR(r1)
 	addi    r3,r1,STACK_FRAME_OVERHEAD
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..7a2ad5e84c16
--- /dev/null
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -0,0 +1,325 @@
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers. Derived from
+ * "arch/x86/kernel/hw_breakpoint.c"
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2010 IBM Corporation
+ * Author: K.Prasad <prasad@linux.vnet.ibm.com>
+ *
+ */
+
+#include <linux/hw_breakpoint.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/sstep.h>
+#include <asm/uaccess.h>
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for every cpu
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg);
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	struct perf_event **slot = &__get_cpu_var(bp_per_reg);
+
+	*slot = bp;
+
+	/*
+	 * Do not install DABR values if the instruction must be single-stepped.
+	 * If so, DABR will be populated in single_step_dabr_instruction().
+	 */
+	if (current->thread.last_hit_ubp != bp)
+		set_dabr(info->address | info->type | DABR_TRANSLATION);
+
+	return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+	struct perf_event **slot = &__get_cpu_var(bp_per_reg);
+
+	if (*slot != bp) {
+		WARN_ONCE(1, "Can't find the breakpoint");
+		return;
+	}
+
+	*slot = NULL;
+	set_dabr(0);
+}
+
+/*
+ * Perform cleanup of arch-specific counters during unregistration
+ * of the perf-event
+ */
+void arch_unregister_hw_breakpoint(struct perf_event *bp)
+{
+	/*
+	 * If the breakpoint is unregistered between a hw_breakpoint_handler()
+	 * and the single_step_dabr_instruction(), then cleanup the breakpoint
+	 * restoration variables to prevent dangling pointers.
+	 */
+	if (bp->ctx->task)
+		bp->ctx->task->thread.last_hit_ubp = NULL;
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+	return is_kernel_addr(info->address);
+}
+
+int arch_bp_generic_fields(int type, int *gen_bp_type)
+{
+	switch (type) {
+	case DABR_DATA_READ:
+		*gen_bp_type = HW_BREAKPOINT_R;
+		break;
+	case DABR_DATA_WRITE:
+		*gen_bp_type = HW_BREAKPOINT_W;
+		break;
+	case (DABR_DATA_WRITE | DABR_DATA_READ):
+		*gen_bp_type = (HW_BREAKPOINT_W | HW_BREAKPOINT_R);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
+{
+	int ret = -EINVAL;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+	if (!bp)
+		return ret;
+
+	switch (bp->attr.bp_type) {
+	case HW_BREAKPOINT_R:
+		info->type = DABR_DATA_READ;
+		break;
+	case HW_BREAKPOINT_W:
+		info->type = DABR_DATA_WRITE;
+		break;
+	case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
+		info->type = (DABR_DATA_READ | DABR_DATA_WRITE);
+		break;
+	default:
+		return ret;
+	}
+
+	info->address = bp->attr.bp_addr;
+	info->len = bp->attr.bp_len;
+
+	/*
+	 * Since breakpoint length can be a maximum of HW_BREAKPOINT_LEN(8)
+	 * and breakpoint addresses are aligned to nearest double-word
+	 * HW_BREAKPOINT_ALIGN by rounding off to the lower address, the
+	 * 'symbolsize' should satisfy the check below.
+	 */
+	if (info->len >
+	    (HW_BREAKPOINT_LEN - (info->address & HW_BREAKPOINT_ALIGN)))
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+	bool is_ptrace_bp = false;
+	int rc = NOTIFY_STOP;
+	struct perf_event *bp;
+	struct pt_regs *regs = args->regs;
+	int stepped = 1;
+	struct arch_hw_breakpoint *info;
+	unsigned int instr;
+
+	/* Disable breakpoints during exception handling */
+	set_dabr(0);
+	/*
+	 * The counter may be concurrently released but that can only
+	 * occur from a call_rcu() path. We can then safely fetch
+	 * the breakpoint, use its callback, touch its counter
+	 * while we are in an rcu_read_lock() path.
+	 */
+	rcu_read_lock();
+
+	bp = __get_cpu_var(bp_per_reg);
+	if (!bp)
+		goto out;
+	info = counter_arch_bp(bp);
+	is_ptrace_bp = (bp->overflow_handler == ptrace_triggered) ?
+			true : false;
+
+	/*
+	 * Return early after invoking user-callback function without restoring
+	 * DABR if the breakpoint is from ptrace which always operates in
+	 * one-shot mode. The ptrace-ed process will receive the SIGTRAP signal
+	 * generated in do_dabr().
+	 */
+	if (is_ptrace_bp) {
+		perf_bp_event(bp, regs);
+		rc = NOTIFY_DONE;
+		goto out;
+	}
+
+	/* Do not emulate user-space instructions, instead single-step them */
+	if (user_mode(regs)) {
+		bp->ctx->task->thread.last_hit_ubp = bp;
+		regs->msr |= MSR_SE;
+		goto out;
+	}
+
+	stepped = 0;
+	instr = 0;
+	if (!__get_user_inatomic(instr, (unsigned int *) regs->nip))
+		stepped = emulate_step(regs, instr);
+
+	/*
+	 * emulate_step() could not execute it. We've failed in reliably
+	 * handling the hw-breakpoint. Unregister it and throw a warning
+	 * message to let the user know about it.
+	 */
+	if (!stepped) {
+		WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
+			"0x%lx will be disabled.", info->address);
+		perf_event_disable(bp);
+		goto out;
+	}
+	/*
+	 * As a policy, the callback is invoked in a 'trigger-after-execute'
+	 * fashion
+	 */
+	perf_bp_event(bp, regs);
+
+	set_dabr(info->address | info->type | DABR_TRANSLATION);
+out:
+	rcu_read_unlock();
+	return rc;
+}
+
+/*
+ * Handle single-step exceptions following a DABR hit.
+ */
+int __kprobes single_step_dabr_instruction(struct die_args *args)
+{
+	struct pt_regs *regs = args->regs;
+	struct perf_event *bp = NULL;
+	struct arch_hw_breakpoint *bp_info;
+
+	bp = current->thread.last_hit_ubp;
+	/*
+	 * Check if we are single-stepping as a result of a
+	 * previous HW Breakpoint exception
+	 */
+	if (!bp)
+		return NOTIFY_DONE;
+
+	bp_info = counter_arch_bp(bp);
+
+	/*
+	 * We shall invoke the user-defined callback function in the single
+	 * stepping handler to confirm to 'trigger-after-execute' semantics
+	 */
+	perf_bp_event(bp, regs);
+
+	/*
+	 * Do not disable MSR_SE if the process was already in
+	 * single-stepping mode.
+	 */
+	if (!test_thread_flag(TIF_SINGLESTEP))
+		regs->msr &= ~MSR_SE;
+
+	set_dabr(bp_info->address | bp_info->type | DABR_TRANSLATION);
+	current->thread.last_hit_ubp = NULL;
+	return NOTIFY_STOP;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	int ret = NOTIFY_DONE;
+
+	switch (val) {
+	case DIE_DABR_MATCH:
+		ret = hw_breakpoint_handler(data);
+		break;
+	case DIE_SSTEP:
+		ret = single_step_dabr_instruction(data);
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+	struct thread_struct *t = &tsk->thread;
+
+	unregister_hw_breakpoint(t->ptrace_bps[0]);
+	t->ptrace_bps[0] = NULL;
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+	/* TODO */
+}
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 26f9900f773c..6c7c546aa1be 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -25,6 +25,7 @@
 #include <asm/sections.h>	/* _end */
 #include <asm/prom.h>
 #include <asm/smp.h>
+#include <asm/hw_breakpoint.h>
 
 int default_machine_kexec_prepare(struct kimage *image)
 {
@@ -165,6 +166,7 @@ static void kexec_smp_down(void *arg)
 	while(kexec_all_irq_disabled == 0)
 		cpu_relax();
 	mb(); /* make sure all irqs are disabled before this */
+	hw_breakpoint_disable();
 	/*
 	 * Now every CPU has IRQs off, we can clear out any pending
 	 * IPIs and be sure that no more will come in after this.
@@ -180,6 +182,7 @@ static void kexec_prepare_cpus_wait(int wait_state)
 {
 	int my_cpu, i, notified=-1;
 
+	hw_breakpoint_disable();
 	my_cpu = get_cpu();
 	/* Make sure each CPU has atleast made it to the state we need */
 	for (i=0; i < NR_CPUS; i++) {
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9d255b4f0a0e..cbf3521d50b6 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -37,6 +37,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/personality.h>
 #include <linux/random.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
@@ -462,8 +463,14 @@ struct task_struct *__switch_to(struct task_struct *prev,
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 	switch_booke_debug_regs(&new->thread);
 #else
+/*
+ * For PPC_BOOK3S_64, we use the hw-breakpoint interfaces that would
+ * schedule DABR
+ */
+#ifndef CONFIG_HAVE_HW_BREAKPOINT
 	if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr))
 		set_dabr(new->thread.dabr);
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
 #endif
 
 
@@ -642,7 +649,11 @@ void flush_thread(void)
 {
 	discard_lazy_cpu_state();
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINTS
+	flush_ptrace_hw_breakpoint(current);
+#else /* CONFIG_HAVE_HW_BREAKPOINTS */
 	set_debug_reg_defaults(&current->thread);
+#endif /* CONFIG_HAVE_HW_BREAKPOINTS */
 }
 
 void
@@ -660,6 +671,9 @@ void prepare_to_copy(struct task_struct *tsk)
 	flush_altivec_to_thread(current);
 	flush_vsx_to_thread(current);
 	flush_spe_to_thread(current);
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	flush_ptrace_hw_breakpoint(tsk);
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
 }
 
 /*
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 7a0c0199ea28..11f3cd9c832f 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -32,6 +32,8 @@
 #ifdef CONFIG_PPC32
 #include <linux/module.h>
 #endif
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -866,9 +868,34 @@ void user_disable_single_step(struct task_struct *task)
 	clear_tsk_thread_flag(task, TIF_SINGLESTEP);
 }
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+void ptrace_triggered(struct perf_event *bp, int nmi,
+		      struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct perf_event_attr attr;
+
+	/*
+	 * Disable the breakpoint request here since ptrace has defined a
+	 * one-shot behaviour for breakpoint exceptions in PPC64.
+	 * The SIGTRAP signal is generated automatically for us in do_dabr().
+	 * We don't have to do anything about that here
+	 */
+	attr = bp->attr;
+	attr.disabled = true;
+	modify_user_hw_breakpoint(bp, &attr);
+}
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+
 int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 			       unsigned long data)
 {
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	int ret;
+	struct thread_struct *thread = &(task->thread);
+	struct perf_event *bp;
+	struct perf_event_attr attr;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+
 	/* For ppc64 we support one DABR and no IABR's at the moment (ppc64).
 	 *  For embedded processors we support one DAC and no IAC's at the
 	 *  moment.
@@ -896,6 +923,43 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
 	/* Ensure breakpoint translation bit is set */
 	if (data && !(data & DABR_TRANSLATION))
 		return -EIO;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+	bp = thread->ptrace_bps[0];
+	if ((!data) || !(data & (DABR_DATA_WRITE | DABR_DATA_READ))) {
+		if (bp) {
+			unregister_hw_breakpoint(bp);
+			thread->ptrace_bps[0] = NULL;
+		}
+		return 0;
+	}
+	if (bp) {
+		attr = bp->attr;
+		attr.bp_addr = data & ~HW_BREAKPOINT_ALIGN;
+		arch_bp_generic_fields(data &
+					(DABR_DATA_WRITE | DABR_DATA_READ),
+							&attr.bp_type);
+		ret =  modify_user_hw_breakpoint(bp, &attr);
+		if (ret)
+			return ret;
+		thread->ptrace_bps[0] = bp;
+		thread->dabr = data;
+		return 0;
+	}
+
+	/* Create a new breakpoint request if one doesn't exist already */
+	hw_breakpoint_init(&attr);
+	attr.bp_addr = data & ~HW_BREAKPOINT_ALIGN;
+	arch_bp_generic_fields(data & (DABR_DATA_WRITE | DABR_DATA_READ),
+								&attr.bp_type);
+
+	thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr,
+							ptrace_triggered, task);
+	if (IS_ERR(bp)) {
+		thread->ptrace_bps[0] = NULL;
+		return PTR_ERR(bp);
+	}
+
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
 
 	/* Move contents to the DABR register */
 	task->thread.dabr = data;
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 7581dbffa18e..a52ed2e47ac6 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   memcpy_64.o usercopy_64.o mem_64.o string.o
 obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
 
 ifeq ($(CONFIG_PPC64),y)
 obj-$(CONFIG_SMP)	+= locks.o
-- 
cgit v1.2.3


From 06532a6743d83fac4b79389fc8c86c88cb4e3302 Mon Sep 17 00:00:00 2001
From: "K.Prasad" <prasad@linux.vnet.ibm.com>
Date: Tue, 15 Jun 2010 11:35:41 +0530
Subject: powerpc, hw_breakpoint: Enable hw-breakpoints while handling
 intervening signals

A signal delivered between a hw_breakpoint_handler() and the
single_step_dabr_instruction() will not have the breakpoint active
while the signal handler is running -- the signal delivery will
set up a new MSR value which will not have MSR_SE set, so we
won't get the signal step interrupt until and unless the signal
handler returns (which it may never do).

To fix this, we restore the breakpoint when delivering a signal --
we clear the MSR_SE bit and set the DABR again.  If the signal
handler returns, the DABR interrupt will occur again when the
instruction that we were originally trying to single-step gets
re-executed.

[Paul Mackerras <paulus@samba.org> pointed out the need to do this.]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/hw_breakpoint.h |  3 +++
 arch/powerpc/kernel/hw_breakpoint.c      | 18 ++++++++++++++++++
 arch/powerpc/kernel/signal.c             |  3 +++
 3 files changed, 24 insertions(+)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index b111713b593e..6576bad1069c 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -65,9 +65,12 @@ static inline void hw_breakpoint_disable(void)
 {
 	set_dabr(0);
 }
+extern void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs);
 
 #else	/* CONFIG_HAVE_HW_BREAKPOINT */
 static inline void hw_breakpoint_disable(void) { }
+static inline void thread_change_pc(struct task_struct *tsk,
+					struct pt_regs *regs) { }
 #endif	/* CONFIG_HAVE_HW_BREAKPOINT */
 #endif	/* __KERNEL__ */
 #endif	/* _PPC_BOOK3S_64_HW_BREAKPOINT_H */
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 7a2ad5e84c16..7bd01a56d194 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -174,6 +174,24 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 	return 0;
 }
 
+/*
+ * Restores the breakpoint on the debug registers.
+ * Invoke this function if it is known that the execution context is
+ * about to change to cause loss of MSR_SE settings.
+ */
+void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
+{
+	struct arch_hw_breakpoint *info;
+
+	if (likely(!tsk->thread.last_hit_ubp))
+		return;
+
+	info = counter_arch_bp(tsk->thread.last_hit_ubp);
+	regs->msr &= ~MSR_SE;
+	set_dabr(info->address | info->type | DABR_TRANSLATION);
+	tsk->thread.last_hit_ubp = NULL;
+}
+
 /*
  * Handle debug exception notifications.
  */
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index a0afb555a7c9..7109f5b1baa8 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -11,6 +11,7 @@
 
 #include <linux/tracehook.h>
 #include <linux/signal.h>
+#include <asm/hw_breakpoint.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
@@ -149,6 +150,8 @@ static int do_signal_pending(sigset_t *oldset, struct pt_regs *regs)
 	if (current->thread.dabr)
 		set_dabr(current->thread.dabr);
 #endif
+	/* Re-enable the breakpoints for the signal stack */
+	thread_change_pc(current, regs);
 
 	if (is32) {
         	if (ka.sa.sa_flags & SA_SIGINFO)
-- 
cgit v1.2.3


From e3e94084adb5610987283367574ebc771e8206e1 Mon Sep 17 00:00:00 2001
From: "K.Prasad" <prasad@linux.vnet.ibm.com>
Date: Tue, 15 Jun 2010 11:36:12 +0530
Subject: powerpc, hw_breakpoint: Discard extraneous interrupt due to accesses
 outside symbol length

Many a times, the requested breakpoint length can be less than the
fixed breakpoint length i.e. 8 bytes supported by PowerPC 64-bit
server (Book III S) processors.  This could lead to extraneous
interrupts resulting in false breakpoint notifications.  This
detects and discards such interrupts for non-ptrace requests.
We don't change ptrace behaviour to avoid breaking compatability.

[Suggestion from Paul Mackerras <paulus@samba.org> to add a new flag in
'struct arch_hw_breakpoint' to identify extraneous interrupts]

Signed-off-by: K.Prasad <prasad@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/hw_breakpoint.h |  1 +
 arch/powerpc/kernel/hw_breakpoint.c      | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index 6576bad1069c..ea87f8ae7bdb 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -27,6 +27,7 @@
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
 struct arch_hw_breakpoint {
+	bool		extraneous_interrupt;
 	u8		len; /* length of the target data symbol */
 	int		type;
 	unsigned long	address;
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 7bd01a56d194..ed39805a3b84 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -204,6 +204,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
 	int stepped = 1;
 	struct arch_hw_breakpoint *info;
 	unsigned int instr;
+	unsigned long dar = regs->dar;
 
 	/* Disable breakpoints during exception handling */
 	set_dabr(0);
@@ -234,6 +235,22 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
 		goto out;
 	}
 
+	/*
+	 * Verify if dar lies within the address range occupied by the symbol
+	 * being watched to filter extraneous exceptions.
+	 */
+	if (!((bp->attr.bp_addr <= dar) &&
+	     (dar <= (bp->attr.bp_addr + bp->attr.bp_len)))) {
+		/*
+		 * This exception is triggered not because of a memory access
+		 * on the monitored variable but in the double-word address
+		 * range in which it is contained. We will consume this
+		 * exception, considering it as 'noise'.
+		 */
+		info->extraneous_interrupt = true;
+	} else
+		info->extraneous_interrupt = false;
+
 	/* Do not emulate user-space instructions, instead single-step them */
 	if (user_mode(regs)) {
 		bp->ctx->task->thread.last_hit_ubp = bp;
@@ -261,7 +278,8 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
 	 * As a policy, the callback is invoked in a 'trigger-after-execute'
 	 * fashion
 	 */
-	perf_bp_event(bp, regs);
+	if (!info->extraneous_interrupt)
+		perf_bp_event(bp, regs);
 
 	set_dabr(info->address | info->type | DABR_TRANSLATION);
 out:
@@ -292,7 +310,8 @@ int __kprobes single_step_dabr_instruction(struct die_args *args)
 	 * We shall invoke the user-defined callback function in the single
 	 * stepping handler to confirm to 'trigger-after-execute' semantics
 	 */
-	perf_bp_event(bp, regs);
+	if (!bp_info->extraneous_interrupt)
+		perf_bp_event(bp, regs);
 
 	/*
 	 * Do not disable MSR_SE if the process was already in
-- 
cgit v1.2.3


From d09ec7387184eba9e3030496f0451204090ff610 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 29 Jun 2010 12:50:32 +1000
Subject: powerpc, hw_breakpoint: Tell generic code we have no instruction
 breakpoints

At present, hw_breakpoint_slots() returns 1 regardless of what
type of breakpoint is specified in the type argument.  Since we
don't define CONFIG_HAVE_MIXED_BREAKPOINTS_REGS, there are
separate values for TYPE_INST and TYPE_DATA, and hw_breakpoint_slots()
returns 1 for both, effectively advertising instruction breakpoint
support which doesn't exist.

This fixes it by making hw_breakpoint_slots return 1 for TYPE_DATA
and 0 for TYPE_INST.  This moves hw_breakpoint_slots() from the
powerpc hw_breakpoint.h to hw_breakpoint.c because the definitions
of TYPE_INST and TYPE_DATA aren't available in <asm/hw_breakpoint.h>.
They are defined in <linux/hw_breakpoint.h> but we can't include
that header in <asm/hw_breakpoint.h>, and nor can we rely on
<linux/hw_breakpoint.h> being included before <asm/hw_breakpoint.h>.
Since hw_breakpoint_slots() is only called at boot time, there is
no performance impact from making it a real function rather than
a static inline.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/hw_breakpoint.h |  5 +----
 arch/powerpc/kernel/hw_breakpoint.c      | 10 ++++++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index ea87f8ae7bdb..1c33ec17ca36 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -37,10 +37,6 @@ struct arch_hw_breakpoint {
 #include <asm/reg.h>
 #include <asm/system.h>
 
-static inline int hw_breakpoint_slots(int type)
-{
-	return HBP_NUM;
-}
 struct perf_event;
 struct pmu;
 struct perf_sample_data;
@@ -49,6 +45,7 @@ struct perf_sample_data;
 /* Maximum permissible length of any HW Breakpoint */
 #define HW_BREAKPOINT_LEN 0x8
 
+extern int hw_breakpoint_slots(int type);
 extern int arch_bp_generic_fields(int type, int *gen_bp_type);
 extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
 extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 5a1d55d06a08..5ecd0401cdb1 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -43,6 +43,16 @@
  */
 static DEFINE_PER_CPU(struct perf_event *, bp_per_reg);
 
+/*
+ * Returns total number of data or instruction breakpoints available.
+ */
+int hw_breakpoint_slots(int type)
+{
+	if (type == TYPE_DATA)
+		return HBP_NUM;
+	return 0;		/* no instruction breakpoints available */
+}
+
 /*
  * Install a perf counter breakpoint.
  *
-- 
cgit v1.2.3


From 8fd63a9ea7528463211a6c88d500c51851d960c8 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sun, 20 Jun 2010 19:03:08 +0000
Subject: powerpc: Rework VDSO gettimeofday to prevent time going backwards

Currently it is possible for userspace to see the result of
gettimeofday() going backwards by 1 microsecond, assuming that
userspace is using the gettimeofday() in the VDSO.  The VDSO
gettimeofday() algorithm computes the time in "xsecs", which are
units of 2^-20 seconds, or approximately 0.954 microseconds,
using the algorithm

	now = (timebase - tb_orig_stamp) * tb_to_xs + stamp_xsec

and then converts the time in xsecs to seconds and microseconds.

The kernel updates the tb_orig_stamp and stamp_xsec values every
tick in update_vsyscall().  If the length of the tick is not an
integer number of xsecs, then some precision is lost in converting
the current time to xsecs.  For example, with CONFIG_HZ=1000, the
tick is 1ms long, which is 1048.576 xsecs.  That means that
stamp_xsec will advance by either 1048 or 1049 on each tick.
With the right conditions, it is possible for userspace to get
(timebase - tb_orig_stamp) * tb_to_xs being 1049 if the kernel is
slightly late in updating the vdso_datapage, and then for stamp_xsec
to advance by 1048 when the kernel does update it, and for userspace
to then see (timebase - tb_orig_stamp) * tb_to_xs being zero due to
integer truncation.  The result is that time appears to go backwards
by 1 microsecond.

To fix this we change the VDSO gettimeofday to use a new field in the
VDSO datapage which stores the nanoseconds part of the time as a
fractional number of seconds in a 0.32 binary fraction format.
(Or put another way, as a 32-bit number in units of 0.23283 ns.)
This is convenient because we can use the mulhwu instruction to
convert it to either microseconds or nanoseconds.

Since it turns out that computing the time of day using this new field
is simpler than either using stamp_xsec (as gettimeofday does) or
stamp_xtime.tv_nsec (as clock_gettime does), this converts both
gettimeofday and clock_gettime to use the new field.  The existing
__do_get_tspec function is converted to use the new field and take
a parameter in r7 that indicates the desired resolution, 1,000,000
for microseconds or 1,000,000,000 for nanoseconds.  The __do_get_xsec
function is then unused and is deleted.

The new algorithm is

	now = ((timebase - tb_orig_stamp) << 12) * tb_to_xs
		+ (stamp_xtime_seconds << 32) + stamp_sec_fraction

with 'now' in units of 2^-32 seconds.  That is then converted to
seconds and either microseconds or nanoseconds with

	seconds = now >> 32
	partseconds = ((now & 0xffffffff) * resolution) >> 32

The 32-bit VDSO code also makes a further simplification: it ignores
the bottom 32 bits of the tb_to_xs value, which is a 0.64 format binary
fraction.  Doing so gets rid of 4 multiply instructions.  Assuming
a timebase frequency of 1GHz or less and an update interval of no
more than 10ms, the upper 32 bits of tb_to_xs will be at least
4503599, so the error from ignoring the low 32 bits will be at most
2.2ns, which is more than an order of magnitude less than the time
taken to do gettimeofday or clock_gettime on our fastest processors,
so there is no possibility of seeing inconsistent values due to this.

This also moves update_gtod() down next to its only caller, and makes
update_vsyscall use the time passed in via the wall_time argument rather
than accessing xtime directly.  At present, wall_time always points to
xtime, but that could change in future.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/vdso_datapage.h  |   2 +
 arch/powerpc/kernel/asm-offsets.c         |   1 +
 arch/powerpc/kernel/time.c                |  61 +++++-----
 arch/powerpc/kernel/vdso32/gettimeofday.S | 184 +++++++-----------------------
 arch/powerpc/kernel/vdso64/gettimeofday.S |  88 ++++----------
 5 files changed, 99 insertions(+), 237 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index 13c2c283e178..08679c5319b8 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -85,6 +85,7 @@ struct vdso_data {
 	__s32 wtom_clock_sec;			/* Wall to monotonic clock */
 	__s32 wtom_clock_nsec;
 	struct timespec stamp_xtime;	/* xtime as at tb_orig_stamp */
+	__u32 stamp_sec_fraction;	/* fractional seconds of stamp_xtime */
    	__u32 syscall_map_64[SYSCALL_MAP_SIZE]; /* map of syscalls  */
    	__u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */
 };
@@ -105,6 +106,7 @@ struct vdso_data {
 	__s32 wtom_clock_sec;			/* Wall to monotonic clock */
 	__s32 wtom_clock_nsec;
 	struct timespec stamp_xtime;	/* xtime as at tb_orig_stamp */
+	__u32 stamp_sec_fraction;	/* fractional seconds of stamp_xtime */
    	__u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */
 	__u32 dcache_block_size;	/* L1 d-cache block size     */
 	__u32 icache_block_size;	/* L1 i-cache block size     */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 496cc5b3984f..acbbac6aaa22 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -342,6 +342,7 @@ int main(void)
 	DEFINE(WTOM_CLOCK_SEC, offsetof(struct vdso_data, wtom_clock_sec));
 	DEFINE(WTOM_CLOCK_NSEC, offsetof(struct vdso_data, wtom_clock_nsec));
 	DEFINE(STAMP_XTIME, offsetof(struct vdso_data, stamp_xtime));
+	DEFINE(STAMP_SEC_FRAC, offsetof(struct vdso_data, stamp_sec_fraction));
 	DEFINE(CFG_ICACHE_BLOCKSZ, offsetof(struct vdso_data, icache_block_size));
 	DEFINE(CFG_DCACHE_BLOCKSZ, offsetof(struct vdso_data, dcache_block_size));
 	DEFINE(CFG_ICACHE_LOGBLOCKSZ, offsetof(struct vdso_data, icache_log_block_size));
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 0441bbdadbd1..5adebaf47f13 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -423,30 +423,6 @@ void udelay(unsigned long usecs)
 }
 EXPORT_SYMBOL(udelay);
 
-static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec,
-			       u64 new_tb_to_xs)
-{
-	/*
-	 * tb_update_count is used to allow the userspace gettimeofday code
-	 * to assure itself that it sees a consistent view of the tb_to_xs and
-	 * stamp_xsec variables.  It reads the tb_update_count, then reads
-	 * tb_to_xs and stamp_xsec and then reads tb_update_count again.  If
-	 * the two values of tb_update_count match and are even then the
-	 * tb_to_xs and stamp_xsec values are consistent.  If not, then it
-	 * loops back and reads them again until this criteria is met.
-	 * We expect the caller to have done the first increment of
-	 * vdso_data->tb_update_count already.
-	 */
-	vdso_data->tb_orig_stamp = new_tb_stamp;
-	vdso_data->stamp_xsec = new_stamp_xsec;
-	vdso_data->tb_to_xs = new_tb_to_xs;
-	vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
-	vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
-	vdso_data->stamp_xtime = xtime;
-	smp_wmb();
-	++(vdso_data->tb_update_count);
-}
-
 #ifdef CONFIG_SMP
 unsigned long profile_pc(struct pt_regs *regs)
 {
@@ -873,10 +849,37 @@ static cycle_t timebase_read(struct clocksource *cs)
 	return (cycle_t)get_tb();
 }
 
+static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec,
+			       u64 new_tb_to_xs, struct timespec *now,
+			       u32 frac_sec)
+{
+	/*
+	 * tb_update_count is used to allow the userspace gettimeofday code
+	 * to assure itself that it sees a consistent view of the tb_to_xs and
+	 * stamp_xsec variables.  It reads the tb_update_count, then reads
+	 * tb_to_xs and stamp_xsec and then reads tb_update_count again.  If
+	 * the two values of tb_update_count match and are even then the
+	 * tb_to_xs and stamp_xsec values are consistent.  If not, then it
+	 * loops back and reads them again until this criteria is met.
+	 * We expect the caller to have done the first increment of
+	 * vdso_data->tb_update_count already.
+	 */
+	vdso_data->tb_orig_stamp = new_tb_stamp;
+	vdso_data->stamp_xsec = new_stamp_xsec;
+	vdso_data->tb_to_xs = new_tb_to_xs;
+	vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
+	vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+	vdso_data->stamp_xtime = *now;
+	vdso_data->stamp_sec_fraction = frac_sec;
+	smp_wmb();
+	++(vdso_data->tb_update_count);
+}
+
 void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
 		     u32 mult)
 {
 	u64 t2x, stamp_xsec;
+	u32 frac_sec;
 
 	if (clock != &clocksource_timebase)
 		return;
@@ -888,10 +891,14 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
 	/* XXX this assumes clock->shift == 22 */
 	/* 4611686018 ~= 2^(20+64-22) / 1e9 */
 	t2x = (u64) mult * 4611686018ULL;
-	stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
+	stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC;
 	do_div(stamp_xsec, 1000000000);
-	stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
-	update_gtod(clock->cycle_last, stamp_xsec, t2x);
+	stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC;
+
+	BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC);
+	/* this is tv_nsec / 1e9 as a 0.32 fraction */
+	frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32;
+	update_gtod(clock->cycle_last, stamp_xsec, t2x, wall_time, frac_sec);
 }
 
 void update_vsyscall_tz(void)
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S
index ee038d4bf252..4ee09ee2e836 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -19,8 +19,10 @@
 /* Offset for the low 32-bit part of a field of long type */
 #ifdef CONFIG_PPC64
 #define LOPART	4
+#define TSPEC_TV_SEC	TSPC64_TV_SEC+LOPART
 #else
 #define LOPART	0
+#define TSPEC_TV_SEC	TSPC32_TV_SEC
 #endif
 
 	.text
@@ -41,23 +43,11 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
 	mr	r9, r3			/* datapage ptr in r9 */
 	cmplwi	r10,0			/* check if tv is NULL */
 	beq	3f
-	bl	__do_get_xsec@local	/* get xsec from tb & kernel */
-	bne-	2f			/* out of line -> do syscall */
-
-	/* seconds are xsec >> 20 */
-	rlwinm	r5,r4,12,20,31
-	rlwimi	r5,r3,12,0,19
-	stw	r5,TVAL32_TV_SEC(r10)
-
-	/* get remaining xsec and convert to usec. we scale
-	 * up remaining xsec by 12 bits and get the top 32 bits
-	 * of the multiplication
-	 */
-	rlwinm	r5,r4,12,0,19
-	lis	r6,1000000@h
-	ori	r6,r6,1000000@l
-	mulhwu	r5,r5,r6
-	stw	r5,TVAL32_TV_USEC(r10)
+	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
+	addi	r7,r7,1000000@l		/* so we get microseconds in r4 */
+	bl	__do_get_tspec@local	/* get sec/usec from tb & kernel */
+	stw	r3,TVAL32_TV_SEC(r10)
+	stw	r4,TVAL32_TV_USEC(r10)
 
 3:	cmplwi	r11,0			/* check if tz is NULL */
 	beq	1f
@@ -70,14 +60,6 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
 	crclr	cr0*4+so
 	li	r3,0
 	blr
-
-2:
-	mtlr	r12
-	mr	r3,r10
-	mr	r4,r11
-	li	r0,__NR_gettimeofday
-	sc
-	blr
   .cfi_endproc
 V_FUNCTION_END(__kernel_gettimeofday)
 
@@ -100,7 +82,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
 	mr	r11,r4			/* r11 saves tp */
 	bl	__get_datapage@local	/* get data page */
 	mr	r9,r3			/* datapage ptr in r9 */
-
+	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
+	ori	r7,r7,NSEC_PER_SEC@l
 50:	bl	__do_get_tspec@local	/* get sec/nsec from tb & kernel */
 	bne	cr1,80f			/* not monotonic -> all done */
 
@@ -198,83 +181,12 @@ V_FUNCTION_END(__kernel_clock_getres)
 
 
 /*
- * This is the core of gettimeofday() & friends, it returns the xsec
- * value in r3 & r4 and expects the datapage ptr (non clobbered)
- * in r9. clobbers r0,r4,r5,r6,r7,r8.
- * When returning, r8 contains the counter value that can be reused
- * by the monotonic clock implementation
- */
-__do_get_xsec:
-  .cfi_startproc
-	/* Check for update count & load values. We use the low
-	 * order 32 bits of the update count
-	 */
-1:	lwz	r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
-	andi.	r0,r8,1			/* pending update ? loop */
-	bne-	1b
-	xor	r0,r8,r8		/* create dependency */
-	add	r9,r9,r0
-
-	/* Load orig stamp (offset to TB) */
-	lwz	r5,CFG_TB_ORIG_STAMP(r9)
-	lwz	r6,(CFG_TB_ORIG_STAMP+4)(r9)
-
-	/* Get a stable TB value */
-2:	mftbu	r3
-	mftbl	r4
-	mftbu	r0
-	cmpl	cr0,r3,r0
-	bne-	2b
-
-	/* Substract tb orig stamp. If the high part is non-zero, we jump to
-	 * the slow path which call the syscall.
-	 * If it's ok, then we have our 32 bits tb_ticks value in r7
-	 */
-	subfc	r7,r6,r4
-	subfe.	r0,r5,r3
-	bne-	3f
-
-	/* Load scale factor & do multiplication */
-	lwz	r5,CFG_TB_TO_XS(r9)	/* load values */
-	lwz	r6,(CFG_TB_TO_XS+4)(r9)
-	mulhwu	r4,r7,r5
-	mulhwu	r6,r7,r6
-	mullw	r0,r7,r5
-	addc	r6,r6,r0
-
-	/* At this point, we have the scaled xsec value in r4 + XER:CA
-	 * we load & add the stamp since epoch
-	 */
-	lwz	r5,CFG_STAMP_XSEC(r9)
-	lwz	r6,(CFG_STAMP_XSEC+4)(r9)
-	adde	r4,r4,r6
-	addze	r3,r5
-
-	/* We now have our result in r3,r4. We create a fake dependency
-	 * on that result and re-check the counter
-	 */
-	or	r6,r4,r3
-	xor	r0,r6,r6
-	add	r9,r9,r0
-	lwz	r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
-        cmpl    cr0,r8,r0		/* check if updated */
-	bne-	1b
-
-	/* Warning ! The caller expects CR:EQ to be set to indicate a
-	 * successful calculation (so it won't fallback to the syscall
-	 * method). We have overriden that CR bit in the counter check,
-	 * but fortunately, the loop exit condition _is_ CR:EQ set, so
-	 * we can exit safely here. If you change this code, be careful
-	 * of that side effect.
-	 */
-3:	blr
-  .cfi_endproc
-
-/*
- * This is the core of clock_gettime(), it returns the current
- * time in seconds and nanoseconds in r3 and r4.
+ * This is the core of clock_gettime() and gettimeofday(),
+ * it returns the current time in r3 (seconds) and r4.
+ * On entry, r7 gives the resolution of r4, either USEC_PER_SEC
+ * or NSEC_PER_SEC, giving r4 in microseconds or nanoseconds.
  * It expects the datapage ptr in r9 and doesn't clobber it.
- * It clobbers r0, r5, r6, r10 and returns NSEC_PER_SEC in r7.
+ * It clobbers r0, r5 and r6.
  * On return, r8 contains the counter value that can be reused.
  * This clobbers cr0 but not any other cr field.
  */
@@ -297,70 +209,58 @@ __do_get_tspec:
 2:	mftbu	r3
 	mftbl	r4
 	mftbu	r0
-	cmpl	cr0,r3,r0
+	cmplw	cr0,r3,r0
 	bne-	2b
 
 	/* Subtract tb orig stamp and shift left 12 bits.
 	 */
-	subfc	r7,r6,r4
+	subfc	r4,r6,r4
 	subfe	r0,r5,r3
 	slwi	r0,r0,12
-	rlwimi.	r0,r7,12,20,31
-	slwi	r7,r7,12
+	rlwimi.	r0,r4,12,20,31
+	slwi	r4,r4,12
 
-	/* Load scale factor & do multiplication */
+	/*
+	 * Load scale factor & do multiplication.
+	 * We only use the high 32 bits of the tb_to_xs value.
+	 * Even with a 1GHz timebase clock, the high 32 bits of
+	 * tb_to_xs will be at least 4 million, so the error from
+	 * ignoring the low 32 bits will be no more than 0.25ppm.
+	 * The error will just make the clock run very very slightly
+	 * slow until the next time the kernel updates the VDSO data,
+	 * at which point the clock will catch up to the kernel's value,
+	 * so there is no long-term error accumulation.
+	 */
 	lwz	r5,CFG_TB_TO_XS(r9)	/* load values */
-	lwz	r6,(CFG_TB_TO_XS+4)(r9)
-	mulhwu	r3,r7,r6
-	mullw	r10,r7,r5
-	mulhwu	r4,r7,r5
-	addc	r10,r3,r10
+	mulhwu	r4,r4,r5
 	li	r3,0
 
 	beq+	4f			/* skip high part computation if 0 */
 	mulhwu	r3,r0,r5
-	mullw	r7,r0,r5
-	mulhwu	r5,r0,r6
-	mullw	r6,r0,r6
-	adde	r4,r4,r7
-	addze	r3,r3
+	mullw	r5,r0,r5
 	addc	r4,r4,r5
 	addze	r3,r3
-	addc	r10,r10,r6
-
-4:	addze	r4,r4			/* add in carry */
-	lis	r7,NSEC_PER_SEC@h
-	ori	r7,r7,NSEC_PER_SEC@l
-	mulhwu	r4,r4,r7		/* convert to nanoseconds */
-
-	/* At this point, we have seconds & nanoseconds since the xtime
-	 * stamp in r3+CA and r4.  Load & add the xtime stamp.
+4:
+	/* At this point, we have seconds since the xtime stamp
+	 * as a 32.32 fixed-point number in r3 and r4.
+	 * Load & add the xtime stamp.
 	 */
-#ifdef CONFIG_PPC64
-	lwz	r5,STAMP_XTIME+TSPC64_TV_SEC+LOPART(r9)
-	lwz	r6,STAMP_XTIME+TSPC64_TV_NSEC+LOPART(r9)
-#else
-	lwz	r5,STAMP_XTIME+TSPC32_TV_SEC(r9)
-	lwz	r6,STAMP_XTIME+TSPC32_TV_NSEC(r9)
-#endif
-	add	r4,r4,r6
+	lwz	r5,STAMP_XTIME+TSPEC_TV_SEC(r9)
+	lwz	r6,STAMP_SEC_FRAC(r9)
+	addc	r4,r4,r6
 	adde	r3,r3,r5
 
-	/* We now have our result in r3,r4. We create a fake dependency
-	 * on that result and re-check the counter
+	/* We create a fake dependency on the result in r3/r4
+	 * and re-check the counter
 	 */
 	or	r6,r4,r3
 	xor	r0,r6,r6
 	add	r9,r9,r0
 	lwz	r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9)
-        cmpl    cr0,r8,r0		/* check if updated */
+        cmplw	cr0,r8,r0		/* check if updated */
 	bne-	1b
 
-	/* check for nanosecond overflow and adjust if necessary */
-	cmpw	r4,r7
-	bltlr				/* all done if no overflow */
-	subf	r4,r7,r4		/* adjust if overflow */
-	addi	r3,r3,1
+	mulhwu	r4,r4,r7		/* convert to micro or nanoseconds */
 
 	blr
   .cfi_endproc
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
index 262cd5857a56..e97a9a0dc4ac 100644
--- a/arch/powerpc/kernel/vdso64/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -33,18 +33,11 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
 	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
 	cmpldi	r11,0			/* check if tv is NULL */
 	beq	2f
-	bl	V_LOCAL_FUNC(__do_get_xsec)	/* get xsec from tb & kernel */
-	lis     r7,15			/* r7 = 1000000 = USEC_PER_SEC */
-	ori     r7,r7,16960
-	rldicl  r5,r4,44,20		/* r5 = sec = xsec / XSEC_PER_SEC */
-	rldicr  r6,r5,20,43		/* r6 = sec * XSEC_PER_SEC */
-	std	r5,TVAL64_TV_SEC(r11)	/* store sec in tv */
-	subf	r0,r6,r4		/* r0 = xsec = (xsec - r6) */
-	mulld   r0,r0,r7		/* usec = (xsec * USEC_PER_SEC) /
-					 * XSEC_PER_SEC
-					 */
-	rldicl  r0,r0,44,20
-	std	r0,TVAL64_TV_USEC(r11)	/* store usec in tv */
+	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
+	addi	r7,r7,1000000@l
+	bl	V_LOCAL_FUNC(__do_get_tspec) /* get sec/us from tb & kernel */
+	std	r4,TVAL64_TV_SEC(r11)	/* store sec in tv */
+	std	r5,TVAL64_TV_USEC(r11)	/* store usec in tv */
 2:	cmpldi	r10,0			/* check if tz is NULL */
 	beq	1f
 	lwz	r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */
@@ -77,6 +70,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
   .cfi_register lr,r12
 	mr	r11,r4			/* r11 saves tp */
 	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
+	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
+	ori	r7,r7,NSEC_PER_SEC@l
 50:	bl	V_LOCAL_FUNC(__do_get_tspec)	/* get time from tb & kernel */
 	bne	cr1,80f			/* if not monotonic, all done */
 
@@ -171,49 +166,12 @@ V_FUNCTION_END(__kernel_clock_getres)
 
 
 /*
- * This is the core of gettimeofday(), it returns the xsec
- * value in r4 and expects the datapage ptr (non clobbered)
- * in r3. clobbers r0,r4,r5,r6,r7,r8
- * When returning, r8 contains the counter value that can be reused
- */
-V_FUNCTION_BEGIN(__do_get_xsec)
-  .cfi_startproc
-	/* check for update count & load values */
-1:	ld	r8,CFG_TB_UPDATE_COUNT(r3)
-	andi.	r0,r8,1			/* pending update ? loop */
-	bne-	1b
-	xor	r0,r8,r8		/* create dependency */
-	add	r3,r3,r0
-
-	/* Get TB & offset it. We use the MFTB macro which will generate
-	 * workaround code for Cell.
-	 */
-	MFTB(r7)
-	ld	r9,CFG_TB_ORIG_STAMP(r3)
-	subf	r7,r9,r7
-
-	/* Scale result */
-	ld	r5,CFG_TB_TO_XS(r3)
-	mulhdu	r7,r7,r5
-
-	/* Add stamp since epoch */
-	ld	r6,CFG_STAMP_XSEC(r3)
-	add	r4,r6,r7
-
-	xor	r0,r4,r4
-	add	r3,r3,r0
-	ld	r0,CFG_TB_UPDATE_COUNT(r3)
-        cmpld   cr0,r0,r8		/* check if updated */
-	bne-	1b
-	blr
-  .cfi_endproc
-V_FUNCTION_END(__do_get_xsec)
-
-/*
- * This is the core of clock_gettime(), it returns the current
- * time in seconds and nanoseconds in r4 and r5.
+ * This is the core of clock_gettime() and gettimeofday(),
+ * it returns the current time in r4 (seconds) and r5.
+ * On entry, r7 gives the resolution of r5, either USEC_PER_SEC
+ * or NSEC_PER_SEC, giving r5 in microseconds or nanoseconds.
  * It expects the datapage ptr in r3 and doesn't clobber it.
- * It clobbers r0 and r6 and returns NSEC_PER_SEC in r7.
+ * It clobbers r0, r6 and r9.
  * On return, r8 contains the counter value that can be reused.
  * This clobbers cr0 but not any other cr field.
  */
@@ -229,18 +187,18 @@ V_FUNCTION_BEGIN(__do_get_tspec)
 	/* Get TB & offset it. We use the MFTB macro which will generate
 	 * workaround code for Cell.
 	 */
-	MFTB(r7)
+	MFTB(r6)
 	ld	r9,CFG_TB_ORIG_STAMP(r3)
-	subf	r7,r9,r7
+	subf	r6,r9,r6
 
 	/* Scale result */
 	ld	r5,CFG_TB_TO_XS(r3)
-	sldi	r7,r7,12		/* compute time since stamp_xtime */
-	mulhdu	r6,r7,r5		/* in units of 2^-32 seconds */
+	sldi	r6,r6,12		/* compute time since stamp_xtime */
+	mulhdu	r6,r6,r5		/* in units of 2^-32 seconds */
 
 	/* Add stamp since epoch */
 	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
-	ld	r5,STAMP_XTIME+TSPC64_TV_NSEC(r3)
+	lwz	r5,STAMP_SEC_FRAC(r3)
 	or	r0,r4,r5
 	or	r0,r0,r6
 	xor	r0,r0,r0
@@ -250,17 +208,11 @@ V_FUNCTION_BEGIN(__do_get_tspec)
 	bne-	1b			/* reload if so */
 
 	/* convert to seconds & nanoseconds and add to stamp */
-	lis	r7,NSEC_PER_SEC@h
-	ori	r7,r7,NSEC_PER_SEC@l
-	mulhwu	r0,r6,r7		/* compute nanoseconds and */
+	add	r6,r6,r5		/* add on fractional seconds of xtime */
+	mulhwu	r5,r6,r7		/* compute micro or nanoseconds and */
 	srdi	r6,r6,32		/* seconds since stamp_xtime */
-	clrldi	r0,r0,32
-	add	r5,r5,r0		/* add nanoseconds together */
-	cmpd	r5,r7			/* overflow? */
+	clrldi	r5,r5,32
 	add	r4,r4,r6
-	bltlr				/* all done if no overflow */
-	subf	r5,r7,r5		/* if overflow, adjust */
-	addi	r4,r4,1
 	blr
   .cfi_endproc
 V_FUNCTION_END(__do_get_tspec)
-- 
cgit v1.2.3


From c1aa687d499a8bce55cb8cf962f0b72c0f933f14 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sun, 20 Jun 2010 19:04:14 +0000
Subject: powerpc: Clean up obsolete code relating to decrementer and timebase

Since the decrementer and timekeeping code was moved over to using
the generic clockevents and timekeeping infrastructure, several
variables and functions have been obsolete and effectively unused.
This deletes them.

In particular, wakeup_decrementer() is no longer needed since the
generic code reprograms the decrementer as part of the process of
resuming the timekeeping code, which happens during sysdev resume.
Thus the wakeup_decrementer calls in the suspend_enter methods for
52xx platforms have been removed.  The call in the powermac cpu
frequency change code has been replaced by set_dec(1), which will
cause a timer interrupt as soon as interrupts are enabled, and the
generic code will then reprogram the decrementer with the correct
value.

This also simplifies the generic_suspend_en/disable_irqs functions
and makes them static since they are not referenced outside time.c.
The preempt_enable/disable calls are removed because the generic
code has disabled all but the boot cpu at the point where these
functions are called, so we can't be moved to another cpu.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/machdep.h           |   3 -
 arch/powerpc/include/asm/time.h              |   7 --
 arch/powerpc/kernel/smp.c                    |   2 -
 arch/powerpc/kernel/time.c                   | 136 +--------------------------
 arch/powerpc/platforms/52xx/lite5200_pm.c    |   3 -
 arch/powerpc/platforms/52xx/mpc52xx_pm.c     |   3 -
 arch/powerpc/platforms/powermac/cpufreq_32.c |   8 +-
 7 files changed, 9 insertions(+), 153 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 9f0fc9e6ce0d..7716e68f7b18 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -366,8 +366,5 @@ static inline void log_error(char *buf, unsigned int err_type, int fatal)
 #define machine_late_initcall(mach,fn)		__define_machine_initcall(mach,"7",fn,7)
 #define machine_late_initcall_sync(mach,fn)	__define_machine_initcall(mach,"7s",fn,7s)
 
-void generic_suspend_disable_irqs(void);
-void generic_suspend_enable_irqs(void);
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_MACHDEP_H */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 27ccb764fdab..dc779dfcf258 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -28,16 +28,12 @@
 extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
-extern u64 tb_to_xs;
-extern unsigned      tb_to_us;
 
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
-extern time_t last_rtc_update;
 
 extern void generic_calibrate_decr(void);
-extern void wakeup_decrementer(void);
 extern void snapshot_timebase(void);
 
 extern void set_dec_cpu6(unsigned int val);
@@ -204,9 +200,6 @@ static inline unsigned long tb_ticks_since(unsigned long tstamp)
 extern u64 mulhdu(u64, u64);
 #endif
 
-extern void smp_space_timers(unsigned int);
-
-extern unsigned mulhwu_scale_factor(unsigned, unsigned);
 extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 			 unsigned divisor, struct div_result *dr);
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5c196d1086d9..8764daad309b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -288,8 +288,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 			max_cpus = NR_CPUS;
 	else
 		max_cpus = 1;
- 
-	smp_space_timers(max_cpus);
 
 	for_each_possible_cpu(cpu)
 		if (cpu != boot_cpuid)
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 5adebaf47f13..ccb8759c8532 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -149,16 +149,6 @@ unsigned long tb_ticks_per_usec = 100; /* sane default */
 EXPORT_SYMBOL(tb_ticks_per_usec);
 unsigned long tb_ticks_per_sec;
 EXPORT_SYMBOL(tb_ticks_per_sec);	/* for cputime_t conversions */
-u64 tb_to_xs;
-unsigned tb_to_us;
-
-#define TICKLEN_SCALE	NTP_SCALE_SHIFT
-static u64 last_tick_len;	/* units are ns / 2^TICKLEN_SCALE */
-static u64 ticklen_to_xs;	/* 0.64 fraction */
-
-/* If last_tick_len corresponds to about 1/HZ seconds, then
-   last_tick_len << TICKLEN_SHIFT will be about 2^63. */
-#define TICKLEN_SHIFT	(63 - 30 - TICKLEN_SCALE + SHIFT_HZ)
 
 DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL_GPL(rtc_lock);
@@ -174,7 +164,6 @@ unsigned long ppc_proc_freq;
 EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
-static u64 tb_last_jiffy __cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(u64, last_jiffy);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
@@ -446,7 +435,6 @@ EXPORT_SYMBOL(profile_pc);
 
 static int __init iSeries_tb_recal(void)
 {
-	struct div_result divres;
 	unsigned long titan, tb;
 
 	/* Make sure we only run on iSeries */
@@ -477,10 +465,7 @@ static int __init iSeries_tb_recal(void)
 				tb_ticks_per_jiffy = new_tb_ticks_per_jiffy;
 				tb_ticks_per_sec   = new_tb_ticks_per_sec;
 				calc_cputime_factors();
-				div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres );
-				tb_to_xs = divres.result_low;
 				vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
-				vdso_data->tb_to_xs = tb_to_xs;
 				setup_cputime_one_jiffy();
 			}
 			else {
@@ -643,27 +628,9 @@ void timer_interrupt(struct pt_regs * regs)
 	trace_timer_interrupt_exit(regs);
 }
 
-void wakeup_decrementer(void)
-{
-	unsigned long ticks;
-
-	/*
-	 * The timebase gets saved on sleep and restored on wakeup,
-	 * so all we need to do is to reset the decrementer.
-	 */
-	ticks = tb_ticks_since(__get_cpu_var(last_jiffy));
-	if (ticks < tb_ticks_per_jiffy)
-		ticks = tb_ticks_per_jiffy - ticks;
-	else
-		ticks = 1;
-	set_dec(ticks);
-}
-
 #ifdef CONFIG_SUSPEND
-void generic_suspend_disable_irqs(void)
+static void generic_suspend_disable_irqs(void)
 {
-	preempt_disable();
-
 	/* Disable the decrementer, so that it doesn't interfere
 	 * with suspending.
 	 */
@@ -673,12 +640,9 @@ void generic_suspend_disable_irqs(void)
 	set_dec(0x7fffffff);
 }
 
-void generic_suspend_enable_irqs(void)
+static void generic_suspend_enable_irqs(void)
 {
-	wakeup_decrementer();
-
 	local_irq_enable();
-	preempt_enable();
 }
 
 /* Overrides the weak version in kernel/power/main.c */
@@ -698,23 +662,6 @@ void arch_suspend_enable_irqs(void)
 }
 #endif
 
-#ifdef CONFIG_SMP
-void __init smp_space_timers(unsigned int max_cpus)
-{
-	int i;
-	u64 previous_tb = per_cpu(last_jiffy, boot_cpuid);
-
-	/* make sure tb > per_cpu(last_jiffy, cpu) for all cpus always */
-	previous_tb -= tb_ticks_per_jiffy;
-
-	for_each_possible_cpu(i) {
-		if (i == boot_cpuid)
-			continue;
-		per_cpu(last_jiffy, i) = previous_tb;
-	}
-}
-#endif
-
 /*
  * Scheduler clock - returns current time in nanosec units.
  *
@@ -1014,15 +961,13 @@ void secondary_cpu_time_init(void)
 /* This function is only called on the boot processor */
 void __init time_init(void)
 {
-	unsigned long flags;
 	struct div_result res;
-	u64 scale, x;
+	u64 scale;
 	unsigned shift;
 
 	if (__USE_RTC()) {
 		/* 601 processor: dec counts down by 128 every 128ns */
 		ppc_tb_freq = 1000000000;
-		tb_last_jiffy = get_rtcl();
 	} else {
 		/* Normal PowerPC with timebase register */
 		ppc_md.calibrate_decr();
@@ -1030,49 +975,14 @@ void __init time_init(void)
 		       ppc_tb_freq / 1000000, ppc_tb_freq % 1000000);
 		printk(KERN_DEBUG "time_init: processor frequency   = %lu.%.6lu MHz\n",
 		       ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
-		tb_last_jiffy = get_tb();
 	}
 
 	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
 	tb_ticks_per_sec = ppc_tb_freq;
 	tb_ticks_per_usec = ppc_tb_freq / 1000000;
-	tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
 	calc_cputime_factors();
 	setup_cputime_one_jiffy();
 
-	/*
-	 * Calculate the length of each tick in ns.  It will not be
-	 * exactly 1e9/HZ unless ppc_tb_freq is divisible by HZ.
-	 * We compute 1e9 * tb_ticks_per_jiffy / ppc_tb_freq,
-	 * rounded up.
-	 */
-	x = (u64) NSEC_PER_SEC * tb_ticks_per_jiffy + ppc_tb_freq - 1;
-	do_div(x, ppc_tb_freq);
-	tick_nsec = x;
-	last_tick_len = x << TICKLEN_SCALE;
-
-	/*
-	 * Compute ticklen_to_xs, which is a factor which gets multiplied
-	 * by (last_tick_len << TICKLEN_SHIFT) to get a tb_to_xs value.
-	 * It is computed as:
-	 * ticklen_to_xs = 2^N / (tb_ticks_per_jiffy * 1e9)
-	 * where N = 64 + 20 - TICKLEN_SCALE - TICKLEN_SHIFT
-	 * which turns out to be N = 51 - SHIFT_HZ.
-	 * This gives the result as a 0.64 fixed-point fraction.
-	 * That value is reduced by an offset amounting to 1 xsec per
-	 * 2^31 timebase ticks to avoid problems with time going backwards
-	 * by 1 xsec when we do timer_recalc_offset due to losing the
-	 * fractional xsec.  That offset is equal to ppc_tb_freq/2^51
-	 * since there are 2^20 xsec in a second.
-	 */
-	div128_by_32((1ULL << 51) - ppc_tb_freq, 0,
-		     tb_ticks_per_jiffy << SHIFT_HZ, &res);
-	div128_by_32(res.result_high, res.result_low, NSEC_PER_SEC, &res);
-	ticklen_to_xs = res.result_low;
-
-	/* Compute tb_to_xs from tick_nsec */
-	tb_to_xs = mulhdu(last_tick_len << TICKLEN_SHIFT, ticklen_to_xs);
-
 	/*
 	 * Compute scale factor for sched_clock.
 	 * The calibrate_decr() function has set tb_ticks_per_sec,
@@ -1094,21 +1004,14 @@ void __init time_init(void)
 	/* Save the current timebase to pretty up CONFIG_PRINTK_TIME */
 	boot_tb = get_tb_or_rtc();
 
-	write_seqlock_irqsave(&xtime_lock, flags);
-
 	/* If platform provided a timezone (pmac), we correct the time */
         if (timezone_offset) {
 		sys_tz.tz_minuteswest = -timezone_offset / 60;
 		sys_tz.tz_dsttime = 0;
         }
 
-	vdso_data->tb_orig_stamp = tb_last_jiffy;
 	vdso_data->tb_update_count = 0;
 	vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
-	vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
-	vdso_data->tb_to_xs = tb_to_xs;
-
-	write_sequnlock_irqrestore(&xtime_lock, flags);
 
 	/* Start the decrementer on CPUs that have manual control
 	 * such as BookE
@@ -1202,39 +1105,6 @@ void to_tm(int tim, struct rtc_time * tm)
 	GregorianDay(tm);
 }
 
-/* Auxiliary function to compute scaling factors */
-/* Actually the choice of a timebase running at 1/4 the of the bus
- * frequency giving resolution of a few tens of nanoseconds is quite nice.
- * It makes this computation very precise (27-28 bits typically) which
- * is optimistic considering the stability of most processor clock
- * oscillators and the precision with which the timebase frequency
- * is measured but does not harm.
- */
-unsigned mulhwu_scale_factor(unsigned inscale, unsigned outscale)
-{
-        unsigned mlt=0, tmp, err;
-        /* No concern for performance, it's done once: use a stupid
-         * but safe and compact method to find the multiplier.
-         */
-  
-        for (tmp = 1U<<31; tmp != 0; tmp >>= 1) {
-                if (mulhwu(inscale, mlt|tmp) < outscale)
-			mlt |= tmp;
-        }
-  
-        /* We might still be off by 1 for the best approximation.
-         * A side effect of this is that if outscale is too large
-         * the returned value will be zero.
-         * Many corner cases have been checked and seem to work,
-         * some might have been forgotten in the test however.
-         */
-  
-        err = inscale * (mlt+1);
-        if (err <= inscale/2)
-		mlt++;
-        return mlt;
-}
-
 /*
  * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit
  * result.
diff --git a/arch/powerpc/platforms/52xx/lite5200_pm.c b/arch/powerpc/platforms/52xx/lite5200_pm.c
index b5c753db125e..80234e5921f5 100644
--- a/arch/powerpc/platforms/52xx/lite5200_pm.c
+++ b/arch/powerpc/platforms/52xx/lite5200_pm.c
@@ -216,9 +216,6 @@ static int lite5200_pm_enter(suspend_state_t state)
 
 	lite5200_restore_regs();
 
-	/* restart jiffies */
-	wakeup_decrementer();
-
 	iounmap(mbar);
 	return 0;
 }
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pm.c b/arch/powerpc/platforms/52xx/mpc52xx_pm.c
index 76722532bd95..568cef636275 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pm.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pm.c
@@ -171,9 +171,6 @@ int mpc52xx_pm_enter(suspend_state_t state)
 	/* restore SRAM */
 	memcpy(sram, saved_sram, sram_size);
 
-	/* restart jiffies */
-	wakeup_decrementer();
-
 	/* reenable interrupts in PIC */
 	out_be32(&intr->main_mask, intr_main_mask);
 
diff --git a/arch/powerpc/platforms/powermac/cpufreq_32.c b/arch/powerpc/platforms/powermac/cpufreq_32.c
index 1e9eba175ff0..415ca6d6b273 100644
--- a/arch/powerpc/platforms/powermac/cpufreq_32.c
+++ b/arch/powerpc/platforms/powermac/cpufreq_32.c
@@ -310,8 +310,12 @@ static int pmu_set_cpu_speed(int low_speed)
 	/* Restore low level PMU operations */
 	pmu_unlock();
 
-	/* Restore decrementer */
-	wakeup_decrementer();
+	/*
+	 * Restore decrementer; we'll take a decrementer interrupt
+	 * as soon as interrupts are re-enabled and the generic
+	 * clockevents code will reprogram it with the right value.
+	 */
+	set_dec(1);
 
 	/* Restore interrupts */
  	mpic_cpu_set_priority(pic_prio);
-- 
cgit v1.2.3


From 8fe93f8d850a24581e9d47df5814b257fe451052 Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Wed, 7 Jul 2010 12:31:01 +0000
Subject: powerpc/pseries: Migration code reorganization / hibernation prep

Partition hibernation will use some of the same code as is
currently used for Live Partition Migration. This function
further abstracts this code such that code outside of rtas.c
can utilize it. It also changes the error field in the suspend
me data structure to be an atomic type, since it is set and
checked on different cpus without any barriers or locking.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/hvcall.h |   1 +
 arch/powerpc/include/asm/rtas.h   |  10 ++++
 arch/powerpc/kernel/rtas.c        | 105 +++++++++++++++++++++++++-------------
 3 files changed, 81 insertions(+), 35 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 5119b7db3142..de03ca58db5d 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -74,6 +74,7 @@
 #define H_NOT_ENOUGH_RESOURCES -44
 #define H_R_STATE       -45
 #define H_RESCINDEND    -46
+#define H_MULTI_THREADS_ACTIVE -9005
 
 
 /* Long Busy is a condition that can be returned by the firmware
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 20de73c36682..8941c54b30ea 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -63,6 +63,14 @@ struct rtas_t {
 	struct device_node *dev;	/* virtual address pointer */
 };
 
+struct rtas_suspend_me_data {
+	atomic_t working; /* number of cpus accessing this struct */
+	atomic_t done;
+	int token; /* ibm,suspend-me */
+	atomic_t error;
+	struct completion *complete; /* wait on this until working == 0 */
+};
+
 /* RTAS event classes */
 #define RTAS_INTERNAL_ERROR		0x80000000 /* set bit 0 */
 #define RTAS_EPOW_WARNING		0x40000000 /* set bit 1 */
@@ -174,6 +182,8 @@ extern int rtas_set_indicator(int indicator, int index, int new_value);
 extern int rtas_set_indicator_fast(int indicator, int index, int new_value);
 extern void rtas_progress(char *s, unsigned short hex);
 extern void rtas_initialize(void);
+extern int rtas_suspend_cpu(struct rtas_suspend_me_data *data);
+extern int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data);
 
 struct rtc_time;
 extern unsigned long rtas_get_boot_time(void);
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 0e1ec6f746f6..41cfde0ca11b 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -47,14 +47,6 @@ struct rtas_t rtas = {
 };
 EXPORT_SYMBOL(rtas);
 
-struct rtas_suspend_me_data {
-	atomic_t working; /* number of cpus accessing this struct */
-	atomic_t done;
-	int token; /* ibm,suspend-me */
-	int error;
-	struct completion *complete; /* wait on this until working == 0 */
-};
-
 DEFINE_SPINLOCK(rtas_data_buf_lock);
 EXPORT_SYMBOL(rtas_data_buf_lock);
 
@@ -714,14 +706,53 @@ void rtas_os_term(char *str)
 
 static int ibm_suspend_me_token = RTAS_UNKNOWN_SERVICE;
 #ifdef CONFIG_PPC_PSERIES
-static void rtas_percpu_suspend_me(void *info)
+static int __rtas_suspend_last_cpu(struct rtas_suspend_me_data *data, int wake_when_done)
+{
+	u16 slb_size = mmu_slb_size;
+	int rc = H_MULTI_THREADS_ACTIVE;
+	int cpu;
+
+	slb_set_size(SLB_MIN_SIZE);
+	printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n", smp_processor_id());
+
+	while (rc == H_MULTI_THREADS_ACTIVE && !atomic_read(&data->done) &&
+	       !atomic_read(&data->error))
+		rc = rtas_call(data->token, 0, 1, NULL);
+
+	if (rc || atomic_read(&data->error)) {
+		printk(KERN_DEBUG "ibm,suspend-me returned %d\n", rc);
+		slb_set_size(slb_size);
+	}
+
+	if (atomic_read(&data->error))
+		rc = atomic_read(&data->error);
+
+	atomic_set(&data->error, rc);
+
+	if (wake_when_done) {
+		atomic_set(&data->done, 1);
+
+		for_each_online_cpu(cpu)
+			plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
+	}
+
+	if (atomic_dec_return(&data->working) == 0)
+		complete(data->complete);
+
+	return rc;
+}
+
+int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data)
+{
+	atomic_inc(&data->working);
+	return __rtas_suspend_last_cpu(data, 0);
+}
+
+static int __rtas_suspend_cpu(struct rtas_suspend_me_data *data, int wake_when_done)
 {
 	long rc = H_SUCCESS;
 	unsigned long msr_save;
-	u16 slb_size = mmu_slb_size;
 	int cpu;
-	struct rtas_suspend_me_data *data =
-		(struct rtas_suspend_me_data *)info;
 
 	atomic_inc(&data->working);
 
@@ -729,7 +760,7 @@ static void rtas_percpu_suspend_me(void *info)
 	msr_save = mfmsr();
 	mtmsr(msr_save & ~(MSR_EE));
 
-	while (rc == H_SUCCESS && !atomic_read(&data->done))
+	while (rc == H_SUCCESS && !atomic_read(&data->done) && !atomic_read(&data->error))
 		rc = plpar_hcall_norets(H_JOIN);
 
 	mtmsr(msr_save);
@@ -741,33 +772,37 @@ static void rtas_percpu_suspend_me(void *info)
 		/* All other cpus are in H_JOIN, this cpu does
 		 * the suspend.
 		 */
-		slb_set_size(SLB_MIN_SIZE);
-		printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n",
-		       smp_processor_id());
-		data->error = rtas_call(data->token, 0, 1, NULL);
-
-		if (data->error) {
-			printk(KERN_DEBUG "ibm,suspend-me returned %d\n",
-			       data->error);
-			slb_set_size(slb_size);
-		}
+		return __rtas_suspend_last_cpu(data, wake_when_done);
 	} else {
 		printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n",
 		       smp_processor_id(), rc);
-		data->error = rc;
+		atomic_set(&data->error, rc);
 	}
 
-	atomic_set(&data->done, 1);
+	if (wake_when_done) {
+		atomic_set(&data->done, 1);
 
-	/* This cpu did the suspend or got an error; in either case,
-	 * we need to prod all other other cpus out of join state.
-	 * Extra prods are harmless.
-	 */
-	for_each_online_cpu(cpu)
-		plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
+		/* This cpu did the suspend or got an error; in either case,
+		 * we need to prod all other other cpus out of join state.
+		 * Extra prods are harmless.
+		 */
+		for_each_online_cpu(cpu)
+			plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
+	}
 out:
 	if (atomic_dec_return(&data->working) == 0)
 		complete(data->complete);
+	return rc;
+}
+
+int rtas_suspend_cpu(struct rtas_suspend_me_data *data)
+{
+	return __rtas_suspend_cpu(data, 0);
+}
+
+static void rtas_percpu_suspend_me(void *info)
+{
+	__rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1);
 }
 
 static int rtas_ibm_suspend_me(struct rtas_args *args)
@@ -802,22 +837,22 @@ static int rtas_ibm_suspend_me(struct rtas_args *args)
 
 	atomic_set(&data.working, 0);
 	atomic_set(&data.done, 0);
+	atomic_set(&data.error, 0);
 	data.token = rtas_token("ibm,suspend-me");
-	data.error = 0;
 	data.complete = &done;
 
 	/* Call function on all CPUs.  One of us will make the
 	 * rtas call
 	 */
 	if (on_each_cpu(rtas_percpu_suspend_me, &data, 0))
-		data.error = -EINVAL;
+		atomic_set(&data.error, -EINVAL);
 
 	wait_for_completion(&done);
 
-	if (data.error != 0)
+	if (atomic_read(&data.error) != 0)
 		printk(KERN_ERR "Error doing global join\n");
 
-	return data.error;
+	return atomic_read(&data.error);
 }
 #else /* CONFIG_PPC_PSERIES */
 static int rtas_ibm_suspend_me(struct rtas_args *args)
-- 
cgit v1.2.3


From 32d8ad4e621d6620e925cf540ef1d35aa6fa5a7b Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Wed, 7 Jul 2010 12:31:02 +0000
Subject: powerpc/pseries: Partition hibernation support

Enables support for HMC initiated partition hibernation. This is
a firmware assisted hibernation, since the firmware handles writing
the memory out to disk, along with other partition information,
so we just mimic suspend to ram.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/Kconfig                         |   2 +-
 arch/powerpc/include/asm/machdep.h           |   1 +
 arch/powerpc/platforms/pseries/Makefile      |   4 +
 arch/powerpc/platforms/pseries/hotplug-cpu.c |   3 +
 arch/powerpc/platforms/pseries/suspend.c     | 214 +++++++++++++++++++++++++++
 5 files changed, 223 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/platforms/pseries/suspend.c

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 89bc2e895390..25b28fdc6767 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -219,7 +219,7 @@ config ARCH_HIBERNATION_POSSIBLE
 config ARCH_SUSPEND_POSSIBLE
 	def_bool y
 	depends on ADB_PMU || PPC_EFIKA || PPC_LITE5200 || PPC_83xx || \
-		   PPC_85xx || PPC_86xx
+		   PPC_85xx || PPC_86xx || PPC_PSERIES
 
 config PPC_DCR_NATIVE
 	bool
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 7716e68f7b18..2bad6e5855ad 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -266,6 +266,7 @@ struct machdep_calls {
 	void (*suspend_disable_irqs)(void);
 	void (*suspend_enable_irqs)(void);
 #endif
+	int (*suspend_disable_cpu)(void);
 
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
 	ssize_t (*cpu_probe)(const char *, size_t);
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 3dbef309bc8d..046ace9c4381 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -26,3 +26,7 @@ obj-$(CONFIG_HCALL_STATS)	+= hvCall_inst.o
 obj-$(CONFIG_PHYP_DUMP)	+= phyp_dump.o
 obj-$(CONFIG_CMM)		+= cmm.o
 obj-$(CONFIG_DTL)		+= dtl.o
+
+ifeq ($(CONFIG_PPC_PSERIES),y)
+obj-$(CONFIG_SUSPEND)		+= suspend.o
+endif
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 8f85f399ab9f..bbe507635364 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -116,6 +116,9 @@ static void pseries_mach_cpu_die(void)
 
 	if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
 		set_cpu_current_state(cpu, CPU_STATE_INACTIVE);
+		if (ppc_md.suspend_disable_cpu)
+			ppc_md.suspend_disable_cpu();
+
 		cede_latency_hint = 2;
 
 		get_lppaca()->idle = 1;
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
new file mode 100644
index 000000000000..ed72098bb4e3
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -0,0 +1,214 @@
+/*
+  * Copyright (C) 2010 Brian King IBM Corporation
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program; if not, write to the Free Software
+  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+  */
+
+#include <linux/delay.h>
+#include <linux/suspend.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/rtas.h>
+
+static u64 stream_id;
+static struct sys_device suspend_sysdev;
+static DECLARE_COMPLETION(suspend_work);
+static struct rtas_suspend_me_data suspend_data;
+static atomic_t suspending;
+
+/**
+ * pseries_suspend_begin - First phase of hibernation
+ *
+ * Check to ensure we are in a valid state to hibernate
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_suspend_begin(suspend_state_t state)
+{
+	long vasi_state, rc;
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+	/* Make sure the state is valid */
+	rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id);
+
+	vasi_state = retbuf[0];
+
+	if (rc) {
+		pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc);
+		return rc;
+	} else if (vasi_state == H_VASI_ENABLED) {
+		return -EAGAIN;
+	} else if (vasi_state != H_VASI_SUSPENDING) {
+		pr_err("pseries_suspend_begin: vasi_state returned state %ld\n",
+		       vasi_state);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * pseries_suspend_cpu - Suspend a single CPU
+ *
+ * Makes the H_JOIN call to suspend the CPU
+ *
+ **/
+static int pseries_suspend_cpu(void)
+{
+	if (atomic_read(&suspending))
+		return rtas_suspend_cpu(&suspend_data);
+	return 0;
+}
+
+/**
+ * pseries_suspend_enter - Final phase of hibernation
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_suspend_enter(suspend_state_t state)
+{
+	int rc = rtas_suspend_last_cpu(&suspend_data);
+
+	atomic_set(&suspending, 0);
+	atomic_set(&suspend_data.done, 1);
+	return rc;
+}
+
+/**
+ * pseries_prepare_late - Prepare to suspend all other CPUs
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_prepare_late(void)
+{
+	atomic_set(&suspending, 1);
+	atomic_set(&suspend_data.working, 0);
+	atomic_set(&suspend_data.done, 0);
+	atomic_set(&suspend_data.error, 0);
+	suspend_data.complete = &suspend_work;
+	INIT_COMPLETION(suspend_work);
+	return 0;
+}
+
+/**
+ * store_hibernate - Initiate partition hibernation
+ * @classdev:	sysdev class struct
+ * @attr:		class device attribute struct
+ * @buf:		buffer
+ * @count:		buffer size
+ *
+ * Write the stream ID received from the HMC to this file
+ * to trigger hibernating the partition
+ *
+ * Return value:
+ * 	number of bytes printed to buffer / other on failure
+ **/
+static ssize_t store_hibernate(struct sysdev_class *classdev,
+			       struct sysdev_class_attribute *attr,
+			       const char *buf, size_t count)
+{
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	stream_id = simple_strtoul(buf, NULL, 16);
+
+	do {
+		rc = pseries_suspend_begin(PM_SUSPEND_MEM);
+		if (rc == -EAGAIN)
+			ssleep(1);
+	} while (rc == -EAGAIN);
+
+	if (!rc)
+		rc = pm_suspend(PM_SUSPEND_MEM);
+
+	stream_id = 0;
+
+	if (!rc)
+		rc = count;
+	return rc;
+}
+
+static SYSDEV_CLASS_ATTR(hibernate, S_IWUSR, NULL, store_hibernate);
+
+static struct sysdev_class suspend_sysdev_class = {
+	.name = "power",
+};
+
+static struct platform_suspend_ops pseries_suspend_ops = {
+	.valid		= suspend_valid_only_mem,
+	.begin		= pseries_suspend_begin,
+	.prepare_late	= pseries_prepare_late,
+	.enter		= pseries_suspend_enter,
+};
+
+/**
+ * pseries_suspend_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int pseries_suspend_sysfs_register(struct sys_device *sysdev)
+{
+	int rc;
+
+	if ((rc = sysdev_class_register(&suspend_sysdev_class)))
+		return rc;
+
+	sysdev->id = 0;
+	sysdev->cls = &suspend_sysdev_class;
+
+	if ((rc = sysdev_class_create_file(&suspend_sysdev_class, &attr_hibernate)))
+		goto class_unregister;
+
+	return 0;
+
+class_unregister:
+	sysdev_class_unregister(&suspend_sysdev_class);
+	return rc;
+}
+
+/**
+ * pseries_suspend_init - initcall for pSeries suspend
+ *
+ * Return value:
+ * 	0 on success / other on failure
+ **/
+static int __init pseries_suspend_init(void)
+{
+	int rc;
+
+	if (!machine_is(pseries) || !firmware_has_feature(FW_FEATURE_LPAR))
+		return 0;
+
+	suspend_data.token = rtas_token("ibm,suspend-me");
+	if (suspend_data.token == RTAS_UNKNOWN_SERVICE)
+		return 0;
+
+	if ((rc = pseries_suspend_sysfs_register(&suspend_sysdev)))
+		return rc;
+
+	ppc_md.suspend_disable_cpu = pseries_suspend_cpu;
+	suspend_set_ops(&pseries_suspend_ops);
+	return 0;
+}
+
+__initcall(pseries_suspend_init);
-- 
cgit v1.2.3


From 51c7fdba40e741dfe18455b5e4240b70c422bf2e Mon Sep 17 00:00:00 2001
From: Denis Kirjanov <dkirjanov@hera.kernel.org>
Date: Wed, 16 Jun 2010 04:34:44 +0000
Subject: powerpc/iseries: Fix constant warning

Fix smatch warning: constant 0x8000000000000000 is so big it is unsigned long

Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/abs_addr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/abs_addr.h b/arch/powerpc/include/asm/abs_addr.h
index 98324c5a8286..c3bffc3fa1de 100644
--- a/arch/powerpc/include/asm/abs_addr.h
+++ b/arch/powerpc/include/asm/abs_addr.h
@@ -69,7 +69,7 @@ static inline unsigned long phys_to_abs(unsigned long pa)
  * Legacy iSeries Hypervisor calls
  */
 #define iseries_hv_addr(virtaddr)	\
-	(0x8000000000000000 | virt_to_abs(virtaddr))
+	(0x8000000000000000UL | virt_to_abs(virtaddr))
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_ABS_ADDR_H */
-- 
cgit v1.2.3


From ae01f84b93b274e2f215bdf6d0b46435679b5f9a Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Mon, 31 May 2010 18:45:11 +0000
Subject: powerpc: Optimise per cpu accesses on 64bit

Now we dynamically allocate the paca array, it takes an extra load
whenever we want to access another cpu's paca. One place we do that a lot
is per cpu variables. A simple example:

DEFINE_PER_CPU(unsigned long, vara);
unsigned long test4(int cpu)
{
	return per_cpu(vara, cpu);
}

This takes 4 loads, 5 if you include the actual load of the per cpu variable:

    ld r11,-32760(r30)  # load address of paca pointer
    ld r9,-32768(r30)   # load link address of percpu variable
    sldi r3,r29,9       # get offset into paca (each entry is 512 bytes)
    ld r0,0(r11)        # load paca pointer
    add r3,r0,r3        # paca + offset
    ld r11,64(r3)       # load paca[cpu].data_offset

    ldx r3,r9,r11       # load per cpu variable

If we remove the ppc64 specific per_cpu_offset(), we get the generic one
which indexes into a statically allocated array. This removes one load and
one add:

    ld r11,-32760(r30)  # load address of __per_cpu_offset
    ld r9,-32768(r30)   # load link address of percpu variable
    sldi r3,r29,3       # get offset into __per_cpu_offset (each entry 8 bytes)
    ldx r11,r11,r3      # load __per_cpu_offset[cpu]

    ldx r3,r9,r11       # load per cpu variable

Having all the offsets in one array also helps when iterating over a per cpu
variable across a number of cpus, such as in the scheduler. Before we would
need to load one paca cacheline when calculating each per cpu offset. Now we
have 16 (128 / sizeof(long)) per cpu offsets in each cacheline.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/percpu.h | 3 ---
 arch/powerpc/kernel/asm-offsets.c | 1 -
 arch/powerpc/kernel/setup_64.c    | 9 +++++++--
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/percpu.h b/arch/powerpc/include/asm/percpu.h
index f879252b7ea6..2cedefddba37 100644
--- a/arch/powerpc/include/asm/percpu.h
+++ b/arch/powerpc/include/asm/percpu.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_POWERPC_PERCPU_H_
 #define _ASM_POWERPC_PERCPU_H_
 #ifdef __powerpc64__
-#include <linux/compiler.h>
 
 /*
  * Same as asm-generic/percpu.h, except that we store the per cpu offset
@@ -12,9 +11,7 @@
 
 #include <asm/paca.h>
 
-#define __per_cpu_offset(cpu) (paca[cpu].data_offset)
 #define __my_cpu_offset local_paca->data_offset
-#define per_cpu_offset(x) (__per_cpu_offset(x))
 
 #endif /* CONFIG_SMP */
 #endif /* __powerpc64__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index acbbac6aaa22..1c0607ddccc0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -194,7 +194,6 @@ int main(void)
 	DEFINE(PACA_STARTSPURR, offsetof(struct paca_struct, startspurr));
 	DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
 	DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
-	DEFINE(PACA_DATA_OFFSET, offsetof(struct paca_struct, data_offset));
 	DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 	DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 643dcac40fcb..c352f322dbdd 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -600,6 +600,9 @@ static int pcpu_cpu_distance(unsigned int from, unsigned int to)
 		return REMOTE_DISTANCE;
 }
 
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
 void __init setup_per_cpu_areas(void)
 {
 	const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
@@ -624,8 +627,10 @@ void __init setup_per_cpu_areas(void)
 		panic("cannot initialize percpu area (err=%d)", rc);
 
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
-	for_each_possible_cpu(cpu)
-		paca[cpu].data_offset = delta + pcpu_unit_offsets[cpu];
+	for_each_possible_cpu(cpu) {
+                __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
+		paca[cpu].data_offset = __per_cpu_offset[cpu];
+	}
 }
 #endif
 
-- 
cgit v1.2.3


From b08e281b5ae681abcbc1815c13d4a3eb3df4ff1b Mon Sep 17 00:00:00 2001
From: Mark Nelson <markn@au1.ibm.com>
Date: Wed, 26 May 2010 21:40:39 +0000
Subject: powerpc/pseries: Rename RAS_VECTOR_OFFSET to
 RTAS_VECTOR_EXTERNAL_INTERRUPT and move to rtas.h

The RAS code has a #define, RAS_VECTOR_OFFSET, that's used in the
check-exception RTAS call for the vector offset of the exception.

We'll be using this same vector offset for the upcoming IO Event interrupts
code (0x500) so let's move it to include/asm/rtas.h and call it
RTAS_VECTOR_EXTERNAL_INTERRUPT.

Signed-off-by: Mark Nelson <markn@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/rtas.h      | 3 +++
 arch/powerpc/platforms/pseries/ras.c | 5 ++---
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 8941c54b30ea..3d35f8ae377e 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -145,6 +145,9 @@ struct rtas_suspend_me_data {
 #define RTAS_TYPE_PMGM_CONFIG_CHANGE	0x70
 #define RTAS_TYPE_PMGM_SERVICE_PROC	0x71
 
+/* RTAS check-exception vector offset */
+#define RTAS_VECTOR_EXTERNAL_INTERRUPT	0x500
+
 struct rtas_error_log {
 	unsigned long version:8;		/* Architectural version */
 	unsigned long severity:3;		/* Severity level of error */
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 41a3e9a039ed..a4fc6da87c2e 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -61,7 +61,6 @@ static int ras_check_exception_token;
 
 #define EPOW_SENSOR_TOKEN	9
 #define EPOW_SENSOR_INDEX	0
-#define RAS_VECTOR_OFFSET	0x500
 
 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
 static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
@@ -121,7 +120,7 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
 	spin_lock(&ras_log_buf_lock);
 
 	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
-			   RAS_VECTOR_OFFSET,
+			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
 			   irq_map[irq].hwirq,
 			   RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS,
 			   critical, __pa(&ras_log_buf),
@@ -156,7 +155,7 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
 	spin_lock(&ras_log_buf_lock);
 
 	status = rtas_call(ras_check_exception_token, 6, 1, NULL,
-			   RAS_VECTOR_OFFSET,
+			   RTAS_VECTOR_EXTERNAL_INTERRUPT,
 			   irq_map[irq].hwirq,
 			   RTAS_INTERNAL_ERROR, 1 /*Time Critical */,
 			   __pa(&ras_log_buf),
-- 
cgit v1.2.3


From 41eab6f88f24124df89e38067b3766b7bef06ddb Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sun, 16 May 2010 20:22:31 +0000
Subject: powerpc/numa: Use form 1 affinity to setup node distance

Form 1 affinity allows multiple entries in ibm,associativity-reference-points
which represent affinity domains in decreasing order of importance. The
Linux concept of a node is always the first entry, but using the other
values as an input to node_distance() allows the memory allocator to make
better decisions on which node to go first when local memory has been
exhausted.

We keep things simple and create an array indexed by NUMA node, capped at
4 entries. Each time we lookup an associativity property we initialise
the array which is overkill, but since we should only hit this path during
boot it didn't seem worth adding a per node valid bit.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/topology.h |   3 +
 arch/powerpc/mm/numa.c              | 122 ++++++++++++++++++++++++++----------
 2 files changed, 92 insertions(+), 33 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 32adf7280720..3033c1b30745 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -87,6 +87,9 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 	.balance_interval	= 1,					\
 }
 
+extern int __node_distance(int, int);
+#define node_distance(a, b) __node_distance(a, b)
+
 extern void __init dump_numa_cpu_topology(void);
 
 extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 80d110635d24..f78f19e0a2a4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -42,6 +42,12 @@ EXPORT_SYMBOL(node_data);
 
 static int min_common_depth;
 static int n_mem_addr_cells, n_mem_size_cells;
+static int form1_affinity;
+
+#define MAX_DISTANCE_REF_POINTS 4
+static int distance_ref_points_depth;
+static const unsigned int *distance_ref_points;
+static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
 
 /*
  * Allocate node_to_cpumask_map based on number of available nodes
@@ -204,6 +210,39 @@ static const u32 *of_get_usable_memory(struct device_node *memory)
 	return prop;
 }
 
+int __node_distance(int a, int b)
+{
+	int i;
+	int distance = LOCAL_DISTANCE;
+
+	if (!form1_affinity)
+		return distance;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
+			break;
+
+		/* Double the distance for each NUMA level */
+		distance *= 2;
+	}
+
+	return distance;
+}
+
+static void initialize_distance_lookup_table(int nid,
+		const unsigned int *associativity)
+{
+	int i;
+
+	if (!form1_affinity)
+		return;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		distance_lookup_table[nid][i] =
+			associativity[distance_ref_points[i]];
+	}
+}
+
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
  */
@@ -225,6 +264,10 @@ static int of_node_to_nid_single(struct device_node *device)
 	/* POWER4 LPAR uses 0xffff as invalid node */
 	if (nid == 0xffff || nid >= MAX_NUMNODES)
 		nid = -1;
+
+	if (nid > 0 && tmp[0] >= distance_ref_points_depth)
+		initialize_distance_lookup_table(nid, tmp);
+
 out:
 	return nid;
 }
@@ -251,26 +294,10 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL_GPL(of_node_to_nid);
 
-/*
- * In theory, the "ibm,associativity" property may contain multiple
- * associativity lists because a resource may be multiply connected
- * into the machine.  This resource then has different associativity
- * characteristics relative to its multiple connections.  We ignore
- * this for now.  We also assume that all cpu and memory sets have
- * their distances represented at a common level.  This won't be
- * true for hierarchical NUMA.
- *
- * In any case the ibm,associativity-reference-points should give
- * the correct depth for a normal NUMA system.
- *
- * - Dave Hansen <haveblue@us.ibm.com>
- */
 static int __init find_min_common_depth(void)
 {
-	int depth, index;
-	const unsigned int *ref_points;
+	int depth;
 	struct device_node *rtas_root;
-	unsigned int len;
 	struct device_node *chosen;
 	const char *vec5;
 
@@ -280,18 +307,28 @@ static int __init find_min_common_depth(void)
 		return -1;
 
 	/*
-	 * this property is 2 32-bit integers, each representing a level of
-	 * depth in the associativity nodes.  The first is for an SMP
-	 * configuration (should be all 0's) and the second is for a normal
-	 * NUMA configuration.
+	 * This property is a set of 32-bit integers, each representing
+	 * an index into the ibm,associativity nodes.
+	 *
+	 * With form 0 affinity the first integer is for an SMP configuration
+	 * (should be all 0's) and the second is for a normal NUMA
+	 * configuration. We have only one level of NUMA.
+	 *
+	 * With form 1 affinity the first integer is the most significant
+	 * NUMA boundary and the following are progressively less significant
+	 * boundaries. There can be more than one level of NUMA.
 	 */
-	index = 1;
-	ref_points = of_get_property(rtas_root,
-			"ibm,associativity-reference-points", &len);
+	distance_ref_points = of_get_property(rtas_root,
+					"ibm,associativity-reference-points",
+					&distance_ref_points_depth);
+
+	if (!distance_ref_points) {
+		dbg("NUMA: ibm,associativity-reference-points not found.\n");
+		goto err;
+	}
+
+	distance_ref_points_depth /= sizeof(int);
 
-	/*
-	 * For form 1 affinity information we want the first field
-	 */
 #define VEC5_AFFINITY_BYTE	5
 #define VEC5_AFFINITY		0x80
 	chosen = of_find_node_by_path("/chosen");
@@ -299,19 +336,38 @@ static int __init find_min_common_depth(void)
 		vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL);
 		if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) {
 			dbg("Using form 1 affinity\n");
-			index = 0;
+			form1_affinity = 1;
 		}
 	}
 
-	if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
-		depth = ref_points[index];
+	if (form1_affinity) {
+		depth = distance_ref_points[0];
 	} else {
-		dbg("NUMA: ibm,associativity-reference-points not found.\n");
-		depth = -1;
+		if (distance_ref_points_depth < 2) {
+			printk(KERN_WARNING "NUMA: "
+				"short ibm,associativity-reference-points\n");
+			goto err;
+		}
+
+		depth = distance_ref_points[1];
 	}
-	of_node_put(rtas_root);
 
+	/*
+	 * Warn and cap if the hardware supports more than
+	 * MAX_DISTANCE_REF_POINTS domains.
+	 */
+	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+		printk(KERN_WARNING "NUMA: distance array capped at "
+			"%d entries\n", MAX_DISTANCE_REF_POINTS);
+		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+	}
+
+	of_node_put(rtas_root);
 	return depth;
+
+err:
+	of_node_put(rtas_root);
+	return -1;
 }
 
 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
-- 
cgit v1.2.3


From 0866eb99cc4d5951cb0ed6ddfa92d5a3d55216ae Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 9 Jul 2010 15:21:41 +1000
Subject: powerpc/book3e: mtmsr should not be mtmsrd on book3e 64-bit

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/reg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d62fdf4e504b..d8be016d2ede 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -890,7 +890,7 @@
 #ifndef __ASSEMBLY__
 #define mfmsr()		({unsigned long rval; \
 			asm volatile("mfmsr %0" : "=r" (rval)); rval;})
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S_64
 #define __mtmsrd(v, l)	asm volatile("mtmsrd %0," __stringify(l) \
 				     : : "r" (v) : "memory")
 #define mtmsrd(v)	__mtmsrd((v), 0)
-- 
cgit v1.2.3


From a2e198116f97bb1cd5b37ff33a8cfdfb4010cf5b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 9 Jul 2010 15:24:47 +1000
Subject: powerpc/book3e: Hack to get gdb moving along on Book3E 64-bit

Our handling of debug interrupts on Book3E 64-bit is not quite
the way it should be just yet. This is a workaround to let gdb
work at least for now. We ensure that when context switching,
we set the appropriate DBCR0 value for the new task. We also
make sure that we turn off MSR[DE] within the kernel, and set
it as part of the bits that get set when going back to userspace.

In the long run, we will probably set the userspace DBCR0 on the
exception exit code path and ensure we have some proper kernel
value to set on the way into the kernel, a bit like ppc32 does,
but that will take more work.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/reg_booke.h |  4 ++--
 arch/powerpc/kernel/process.c        | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2360317179a9..66dc6f0d3ad8 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -29,8 +29,8 @@
 #if defined(CONFIG_PPC_BOOK3E_64)
 #define MSR_		MSR_ME | MSR_CE
 #define MSR_KERNEL      MSR_ | MSR_CM
-#define MSR_USER32	MSR_ | MSR_PR | MSR_EE
-#define MSR_USER64	MSR_USER32 | MSR_CM
+#define MSR_USER32	MSR_ | MSR_PR | MSR_EE | MSR_DE
+#define MSR_USER64	MSR_USER32 | MSR_CM | MSR_DE
 #elif defined (CONFIG_40x)
 #define MSR_KERNEL	(MSR_ME|MSR_RI|MSR_IR|MSR_DR|MSR_CE)
 #define MSR_USER	(MSR_KERNEL|MSR_PR|MSR_EE)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1e78453645be..551f6713ff42 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -477,6 +477,28 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	new_thread = &new->thread;
 	old_thread = &current->thread;
 
+#if defined(CONFIG_PPC_BOOK3E_64)
+	/* XXX Current Book3E code doesn't deal with kernel side DBCR0,
+	 * we always hold the user values, so we set it now.
+	 *
+	 * However, we ensure the kernel MSR:DE is appropriately cleared too
+	 * to avoid spurrious single step exceptions in the kernel.
+	 *
+	 * This will have to change to merge with the ppc32 code at some point,
+	 * but I don't like much what ppc32 is doing today so there's some
+	 * thinking needed there
+	 */
+	if ((new_thread->dbcr0 | old_thread->dbcr0) & DBCR0_IDM) {
+		u32 dbcr0;
+
+		mtmsr(mfmsr() & ~MSR_DE);
+		isync();
+		dbcr0 = mfspr(SPRN_DBCR0);
+		dbcr0 = (dbcr0 & DBCR0_EDM) | new_thread->dbcr0;
+		mtspr(SPRN_DBCR0, dbcr0);
+	}
+#endif /* CONFIG_PPC64_BOOK3E */
+
 #ifdef CONFIG_PPC64
 	/*
 	 * Collect processor utilization data per process
-- 
cgit v1.2.3


From b9f1cd71dbf21a91fb7e2336a1d1ff18b97771e5 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 9 Jul 2010 15:29:53 +1000
Subject: powerpc/book3e: More doorbell cleanups. Sample the PIR register

The doorbells use the content of the PIR register to match messages
from other CPUs. This may or may not be the same as our linux CPU
number, so using that as the "target" is no right.

Instead, we sample the PIR register at boot on every processor
and use that value subsequently when sending IPIs.

We also use a per-cpu message mask rather than a global array which
should limit cache line contention.

Note: We could use the CPU number in the device-tree instead of
the PIR register, as they are supposed to be equivalent. This
might prove useful if doorbells are to be used to kick CPUs out
of FW at boot time, thus before we can sample the PIR. This is
however not the case now and using the PIR just works.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/dbell.h  |  7 +++---
 arch/powerpc/kernel/dbell.c       | 47 ++++++++++++++++++++++++++++-----------
 arch/powerpc/platforms/85xx/smp.c |  4 +++-
 3 files changed, 40 insertions(+), 18 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index 501189a543d1..ced7e487ca27 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -27,10 +27,9 @@ enum ppc_dbell {
 	PPC_G_DBELL_MC = 4,	/* guest mcheck doorbell */
 };
 
-#ifdef CONFIG_SMP
-extern unsigned long dbell_smp_message[NR_CPUS];
-extern void smp_dbell_message_pass(int target, int msg);
-#endif
+extern void doorbell_message_pass(int target, int msg);
+extern void doorbell_exception(struct pt_regs *regs);
+extern void doorbell_setup_this_cpu(void);
 
 static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
 {
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index e3a717704fd6..1c7a94580c3f 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -13,45 +13,66 @@
 #include <linux/kernel.h>
 #include <linux/smp.h>
 #include <linux/threads.h>
+#include <linux/percpu.h>
 
 #include <asm/dbell.h>
 
 #ifdef CONFIG_SMP
-unsigned long dbell_smp_message[NR_CPUS];
+struct doorbell_cpu_info {
+	unsigned long	messages;	/* current messages bits */
+	unsigned int	tag;		/* tag value */
+};
 
-void smp_dbell_message_pass(int target, int msg)
+static DEFINE_PER_CPU(struct doorbell_cpu_info, doorbell_cpu_info);
+
+void doorbell_setup_this_cpu(void)
+{
+	struct doorbell_cpu_info *info = &__get_cpu_var(doorbell_cpu_info);
+
+	info->messages = 0;
+	info->tag = mfspr(SPRN_PIR) & 0x3fff;
+}
+
+void doorbell_message_pass(int target, int msg)
 {
+	struct doorbell_cpu_info *info;
 	int i;
 
-	if(target < NR_CPUS) {
-		set_bit(msg, &dbell_smp_message[target]);
-		ppc_msgsnd(PPC_DBELL, 0, target);
+	if (target < NR_CPUS) {
+		info = &per_cpu(doorbell_cpu_info, target);
+		set_bit(msg, &info->messages);
+		ppc_msgsnd(PPC_DBELL, 0, info->tag);
 	}
-	else if(target == MSG_ALL_BUT_SELF) {
+	else if (target == MSG_ALL_BUT_SELF) {
 		for_each_online_cpu(i) {
 			if (i == smp_processor_id())
 				continue;
-			set_bit(msg, &dbell_smp_message[i]);
-			ppc_msgsnd(PPC_DBELL, 0, i);
+			info = &per_cpu(doorbell_cpu_info, i);
+			set_bit(msg, &info->messages);
+			ppc_msgsnd(PPC_DBELL, 0, info->tag);
 		}
 	}
 	else { /* target == MSG_ALL */
-		for_each_online_cpu(i)
-			set_bit(msg, &dbell_smp_message[i]);
+		for_each_online_cpu(i) {
+			info = &per_cpu(doorbell_cpu_info, i);
+			set_bit(msg, &info->messages);
+		}
 		ppc_msgsnd(PPC_DBELL, PPC_DBELL_MSG_BRDCAST, 0);
 	}
 }
 
 void doorbell_exception(struct pt_regs *regs)
 {
-	int cpu = smp_processor_id();
+	struct doorbell_cpu_info *info = &__get_cpu_var(doorbell_cpu_info);
 	int msg;
 
-	if (num_online_cpus() < 2)
+	/* Warning: regs can be NULL when called from irq enable */
+
+	if (!info->messages || (num_online_cpus() < 2))
 		return;
 
 	for (msg = 0; msg < 4; msg++)
-		if (test_and_clear_bit(msg, &dbell_smp_message[cpu]))
+		if (test_and_clear_bit(msg, &info->messages))
 			smp_message_recv(msg);
 }
 
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index a15f582300d8..4c3cde911c71 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -99,6 +99,8 @@ static void __init
 smp_85xx_setup_cpu(int cpu_nr)
 {
 	mpic_setup_this_cpu();
+	if (cpu_has_feature(CPU_FTR_DBELL))
+		doorbell_setup_this_cpu();
 }
 
 struct smp_ops_t smp_85xx_ops = {
@@ -117,7 +119,7 @@ void __init mpc85xx_smp_init(void)
 	}
 
 	if (cpu_has_feature(CPU_FTR_DBELL))
-		smp_85xx_ops.message_pass = smp_dbell_message_pass;
+		smp_85xx_ops.message_pass = doorbell_message_pass;
 
 	BUG_ON(!smp_85xx_ops.message_pass);
 
-- 
cgit v1.2.3


From 850f22d5688941ea51628f3f8f8dcf3baff409ff Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 9 Jul 2010 15:34:00 +1000
Subject: powerpc/book3e: Resend doorbell exceptions to ourself

If we are soft disabled and receive a doorbell exception we don't process
it immediately. This means we need to check on the way out of irq restore
if there are any doorbell exceptions to process.

The problem is at that point we don't know what our regs are, and that
in turn makes xmon unhappy. To workaround the problem, instead of checking
for and processing doorbells, we check for any doorbells and if there were
any we send ourselves another.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/dbell.h |  1 +
 arch/powerpc/kernel/dbell.c      | 10 ++++++++++
 arch/powerpc/kernel/irq.c        |  4 ++--
 3 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index ced7e487ca27..0893ab9343a6 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -29,6 +29,7 @@ enum ppc_dbell {
 
 extern void doorbell_message_pass(int target, int msg);
 extern void doorbell_exception(struct pt_regs *regs);
+extern void doorbell_check_self(void);
 extern void doorbell_setup_this_cpu(void);
 
 static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index f7b518894c80..3307a52d797f 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -81,6 +81,16 @@ out:
 	set_irq_regs(old_regs);
 }
 
+void doorbell_check_self(void)
+{
+	struct doorbell_cpu_info *info = &__get_cpu_var(doorbell_cpu_info);
+
+	if (!info->messages)
+		return;
+
+	ppc_msgsnd(PPC_DBELL, 0, info->tag);
+}
+
 #else /* CONFIG_SMP */
 void doorbell_exception(struct pt_regs *regs)
 {
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 2f6dc7faf6de..8f96d3198905 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -156,8 +156,8 @@ notrace void raw_local_irq_restore(unsigned long en)
 		return;
 
 #if defined(CONFIG_BOOKE) && defined(CONFIG_SMP)
-	/* Check for pending doorbell interrupts on SMP */
-	doorbell_exception(NULL);
+	/* Check for pending doorbell interrupts and resend to ourself */
+	doorbell_check_self();
 #endif
 
 	/*
-- 
cgit v1.2.3


From 34d97e07cc81ab6f1e63696127cc7a5d2c4fce4b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 14 Jul 2010 14:12:16 +1000
Subject: powerpc/book3e: Add generic 64-bit idle powersave support

We use a similar technique to ppc32: We set a thread local flag
to indicate that we are about to enter or have entered the stop
state, and have fixup code in the async interrupt entry code that
reacts to this flag to make us return to a different location
(sets NIP to LINK in our case).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
--
v2. Fix lockdep bug
    Re-mask interrupts when coming back from idle
---
 arch/powerpc/include/asm/machdep.h   |  1 +
 arch/powerpc/kernel/Makefile         |  2 +-
 arch/powerpc/kernel/exceptions-64e.S | 23 ++++++++++
 arch/powerpc/kernel/idle_book3e.S    | 86 ++++++++++++++++++++++++++++++++++++
 4 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/idle_book3e.S

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 2bad6e5855ad..adc8e6cdf339 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -278,6 +278,7 @@ extern void e500_idle(void);
 extern void power4_idle(void);
 extern void power4_cpu_offline_powersave(void);
 extern void ppc6xx_idle(void);
+extern void book3e_idle(void);
 
 /*
  * ppc_md contains a copy of the machine description structure for the
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 8a33318fa46b..77d831a1cc32 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -37,7 +37,7 @@ obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj64-$(CONFIG_RELOCATABLE)	+= reloc_64.o
-obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o
+obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
 obj-$(CONFIG_PPC64)		+= vdso64/
 obj-$(CONFIG_ALTIVEC)		+= vecemu.o
 obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index a42637c3a72d..316465a32a9c 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -204,11 +204,30 @@ exc_##n##_bad_stack:							    \
 	lis	r,TSR_FIS@h;						\
 	mtspr	SPRN_TSR,r
 
+/* Used by asynchronous interrupt that may happen in the idle loop.
+ *
+ * This check if the thread was in the idle loop, and if yes, returns
+ * to the caller rather than the PC. This is to avoid a race if
+ * interrupts happen before the wait instruction.
+ */
+#define CHECK_NAPPING()							\
+	clrrdi	r11,r1,THREAD_SHIFT;					\
+	ld	r10,TI_LOCAL_FLAGS(r11);				\
+	andi.	r9,r10,_TLF_NAPPING;					\
+	beq+	1f;							\
+	ld	r8,_LINK(r1);						\
+	rlwinm	r7,r10,0,~_TLF_NAPPING;					\
+	std	r8,_NIP(r1);						\
+	std	r7,TI_LOCAL_FLAGS(r11);					\
+1:
+
+
 #define MASKABLE_EXCEPTION(trapnum, label, hdlr, ack)			\
 	START_EXCEPTION(label);						\
 	NORMAL_EXCEPTION_PROLOG(trapnum, PROLOG_ADDITION_MASKABLE)	\
 	EXCEPTION_COMMON(trapnum, PACA_EXGEN, INTS_DISABLE_ALL)		\
 	ack(r8);							\
+	CHECK_NAPPING();						\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
 	bl	hdlr;							\
 	b	.ret_from_except_lite;
@@ -257,6 +276,7 @@ interrupt_end_book3e:
 	CRIT_EXCEPTION_PROLOG(0x100, PROLOG_ADDITION_NONE)
 //	EXCEPTION_COMMON(0x100, PACA_EXCRIT, INTS_DISABLE_ALL)
 //	bl	special_reg_save_crit
+//	CHECK_NAPPING();
 //	addi	r3,r1,STACK_FRAME_OVERHEAD
 //	bl	.critical_exception
 //	b	ret_from_crit_except
@@ -268,6 +288,7 @@ interrupt_end_book3e:
 //	EXCEPTION_COMMON(0x200, PACA_EXMC, INTS_DISABLE_ALL)
 //	bl	special_reg_save_mc
 //	addi	r3,r1,STACK_FRAME_OVERHEAD
+//	CHECK_NAPPING();
 //	bl	.machine_check_exception
 //	b	ret_from_mc_except
 	b	.
@@ -338,6 +359,7 @@ interrupt_end_book3e:
 	CRIT_EXCEPTION_PROLOG(0x9f0, PROLOG_ADDITION_NONE)
 //	EXCEPTION_COMMON(0x9f0, PACA_EXCRIT, INTS_DISABLE_ALL)
 //	bl	special_reg_save_crit
+//	CHECK_NAPPING();
 //	addi	r3,r1,STACK_FRAME_OVERHEAD
 //	bl	.unknown_exception
 //	b	ret_from_crit_except
@@ -434,6 +456,7 @@ kernel_dbg_exc:
 	CRIT_EXCEPTION_PROLOG(0x2080, PROLOG_ADDITION_NONE)
 //	EXCEPTION_COMMON(0x2080, PACA_EXCRIT, INTS_DISABLE_ALL)
 //	bl	special_reg_save_crit
+//	CHECK_NAPPING();
 //	addi	r3,r1,STACK_FRAME_OVERHEAD
 //	bl	.doorbell_critical_exception
 //	b	ret_from_crit_except
diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S
new file mode 100644
index 000000000000..16c002d6bdf1
--- /dev/null
+++ b/arch/powerpc/kernel/idle_book3e.S
@@ -0,0 +1,86 @@
+/*
+ * Copyright 2010 IBM Corp, Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ *
+ * Generic idle routine for Book3E processors
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ppc-opcode.h>
+#include <asm/processor.h>
+#include <asm/thread_info.h>
+
+/* 64-bit version only for now */
+#ifdef CONFIG_PPC64
+
+_GLOBAL(book3e_idle)
+	/* Save LR for later */
+	mflr	r0
+	std	r0,16(r1)
+
+	/* Hard disable interrupts */
+	wrteei	0
+
+	/* Now check if an interrupt came in while we were soft disabled
+	 * since we may otherwise lose it (doorbells etc...). We know
+	 * that since PACAHARDIRQEN will have been cleared in that case.
+	 */
+	lbz	r3,PACAHARDIRQEN(r13)
+	cmpwi	cr0,r3,0
+	beqlr
+
+	/* Now we are going to mark ourselves as soft and hard enables in
+	 * order to be able to take interrupts while asleep. We inform lockdep
+	 * of that. We don't actually turn interrupts on just yet tho.
+	 */
+#ifdef CONFIG_TRACE_IRQFLAGS
+	stdu    r1,-128(r1)
+	bl	.trace_hardirqs_on
+#endif
+	li	r0,1
+	stb	r0,PACASOFTIRQEN(r13)
+	stb	r0,PACAHARDIRQEN(r13)
+	
+	/* Interrupts will make use return to LR, so get something we want
+	 * in there
+	 */
+	bl	1f
+
+	/* Hard disable interrupts again */
+	wrteei	0
+
+	/* Mark them off again in the PACA as well */
+	li	r0,0
+	stb	r0,PACASOFTIRQEN(r13)
+	stb	r0,PACAHARDIRQEN(r13)
+
+	/* Tell lockdep about it */
+#ifdef CONFIG_TRACE_IRQFLAGS
+	bl	.trace_hardirqs_off
+	addi    r1,r1,128
+#endif
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+
+1:	/* Let's set the _TLF_NAPPING flag so interrupts make us return
+	 * to the right spot
+	*/
+	clrrdi	r11,r1,THREAD_SHIFT
+	ld	r10,TI_LOCAL_FLAGS(r11)
+	ori	r10,r10,_TLF_NAPPING
+	std	r10,TI_LOCAL_FLAGS(r11)
+
+	/* We can now re-enable hard interrupts and go to sleep */
+	wrteei	1
+1:	PPC_WAIT(0)
+	b	1b
+
+#endif /* CONFIG_PPC64 */
-- 
cgit v1.2.3


From f2b26c923518e03959142715a2b7615cb161cd16 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 9 Jul 2010 14:57:43 +1000
Subject: powerpc/book3e: Adjust the page sizes list based on MMU config

Use the MMU config registers to scan for available direct and
indirect page sizes and print out the result. Will be needed
for future hugetlbfs implementation.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mmu-book3e.h |   4 +
 arch/powerpc/include/asm/reg_booke.h  |   1 +
 arch/powerpc/mm/tlb_nohash.c          | 136 ++++++++++++++++++++++++++--------
 3 files changed, 109 insertions(+), 32 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index 74695816205c..87a1d787c5b6 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -193,6 +193,10 @@ struct mmu_psize_def
 {
 	unsigned int	shift;	/* number of bits */
 	unsigned int	enc;	/* PTE encoding */
+	unsigned int    ind;    /* Corresponding indirect page size shift */
+	unsigned int	flags;
+#define MMU_PAGE_SIZE_DIRECT	0x1	/* Supported as a direct size */
+#define MMU_PAGE_SIZE_INDIRECT	0x2	/* Supported as an indirect size */
 };
 extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 66dc6f0d3ad8..667a498eaee1 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -62,6 +62,7 @@
 #define SPRN_TLB0PS	0x158	/* TLB 0 Page Size Register */
 #define SPRN_MAS5_MAS6	0x15c	/* MMU Assist Register 5 || 6 */
 #define SPRN_MAS8_MAS1	0x15d	/* MMU Assist Register 8 || 1 */
+#define SPRN_EPTCFG	0x15e	/* Embedded Page Table Config */
 #define SPRN_MAS7_MAS3	0x174	/* MMU Assist Register 7 || 3 */
 #define SPRN_MAS0_MAS1	0x175	/* MMU Assist Register 0 || 1 */
 #define SPRN_IVOR0	0x190	/* Interrupt Vector Offset Register 0 */
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 2ce42bf1f67e..3b10f804b735 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -46,6 +46,7 @@
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 	[MMU_PAGE_4K] = {
 		.shift	= 12,
+		.ind	= 20,
 		.enc	= BOOK3E_PAGESZ_4K,
 	},
 	[MMU_PAGE_16K] = {
@@ -54,6 +55,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 	},
 	[MMU_PAGE_64K] = {
 		.shift	= 16,
+		.ind	= 28,
 		.enc	= BOOK3E_PAGESZ_64K,
 	},
 	[MMU_PAGE_1M] = {
@@ -62,6 +64,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 	},
 	[MMU_PAGE_16M] = {
 		.shift	= 24,
+		.ind	= 36,
 		.enc	= BOOK3E_PAGESZ_16M,
 	},
 	[MMU_PAGE_256M] = {
@@ -344,16 +347,108 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
 	}
 }
 
-/*
- * Early initialization of the MMU TLB code
- */
-static void __early_init_mmu(int boot_cpu)
+static void setup_page_sizes(void)
+{
+	unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+	unsigned int tlb0ps = mfspr(SPRN_TLB0PS);
+	unsigned int eptcfg = mfspr(SPRN_EPTCFG);
+	int i, psize;
+
+	/* Look for supported direct sizes */
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+		if (tlb0ps & (1U << (def->shift - 10)))
+			def->flags |= MMU_PAGE_SIZE_DIRECT;
+	}
+
+	/* Indirect page sizes supported ? */
+	if ((tlb0cfg & TLBnCFG_IND) == 0)
+		goto no_indirect;
+
+	/* Now, we only deal with one IND page size for each
+	 * direct size. Hopefully all implementations today are
+	 * unambiguous, but we might want to be careful in the
+	 * future.
+	 */
+	for (i = 0; i < 3; i++) {
+		unsigned int ps, sps;
+
+		sps = eptcfg & 0x1f;
+		eptcfg >>= 5;
+		ps = eptcfg & 0x1f;
+		eptcfg >>= 5;
+		if (!ps || !sps)
+			continue;
+		for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+			struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+			if (ps == (def->shift - 10))
+				def->flags |= MMU_PAGE_SIZE_INDIRECT;
+			if (sps == (def->shift - 10))
+				def->ind = ps + 10;
+		}
+	}
+ no_indirect:
+
+	/* Cleanup array and print summary */
+	pr_info("MMU: Supported page sizes\n");
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		struct mmu_psize_def *def = &mmu_psize_defs[psize];
+		const char *__page_type_names[] = {
+			"unsupported",
+			"direct",
+			"indirect",
+			"direct & indirect"
+		};
+		if (def->flags == 0) {
+			def->shift = 0;	
+			continue;
+		}
+		pr_info("  %8ld KB as %s\n", 1ul << (def->shift - 10),
+			__page_type_names[def->flags & 0x3]);
+	}
+}
+
+static void setup_mmu_htw(void)
 {
 	extern unsigned int interrupt_base_book3e;
 	extern unsigned int exc_data_tlb_miss_htw_book3e;
 	extern unsigned int exc_instruction_tlb_miss_htw_book3e;
 
 	unsigned int *ibase = &interrupt_base_book3e;
+
+	/* Check if HW tablewalk is present, and if yes, enable it by:
+	 *
+	 * - patching the TLB miss handlers to branch to the
+	 *   one dedicates to it
+	 *
+	 * - setting the global book3e_htw_enabled
+       	 */
+	unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+
+	if ((tlb0cfg & TLBnCFG_IND) &&
+	    (tlb0cfg & TLBnCFG_PT)) {
+		/* Our exceptions vectors start with a NOP and -then- a branch
+		 * to deal with single stepping from userspace which stops on
+		 * the second instruction. Thus we need to patch the second
+		 * instruction of the exception, not the first one
+		 */
+		patch_branch(ibase + (0x1c0 / 4) + 1,
+			     (unsigned long)&exc_data_tlb_miss_htw_book3e, 0);
+		patch_branch(ibase + (0x1e0 / 4) + 1,
+			     (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0);
+		book3e_htw_enabled = 1;
+	}
+	pr_info("MMU: Book3E Page Tables %s\n",
+		book3e_htw_enabled ? "Enabled" : "Disabled");
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void __early_init_mmu(int boot_cpu)
+{
 	unsigned int mas4;
 
 	/* XXX This will have to be decided at runtime, but right
@@ -370,40 +465,17 @@ static void __early_init_mmu(int boot_cpu)
 	 */
 	mmu_vmemmap_psize = MMU_PAGE_16M;
 
-	/* Check if HW tablewalk is present, and if yes, enable it by:
-	 *
-	 * - patching the TLB miss handlers to branch to the
-	 *   one dedicates to it
-	 *
-	 * - setting the global book3e_htw_enabled
-	 *
-	 * - Set MAS4:INDD and default page size
-	 */
-
 	/* XXX This code only checks for TLB 0 capabilities and doesn't
 	 *     check what page size combos are supported by the HW. It
 	 *     also doesn't handle the case where a separate array holds
 	 *     the IND entries from the array loaded by the PT.
 	 */
 	if (boot_cpu) {
-		unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
-
-		/* Check if HW loader is supported */
-		if ((tlb0cfg & TLBnCFG_IND) &&
-		    (tlb0cfg & TLBnCFG_PT)) {
-			/* Our exceptions vectors start with a NOP and -then- a branch
-			 * to deal with single stepping from userspace which stops on
-			 * the second instruction. Thus we need to patch the second
-			 * instruction of the exception, not the first one
-			 */
-			patch_branch(ibase + (0x1c0 / 4) + 1,
-				(unsigned long)&exc_data_tlb_miss_htw_book3e, 0);
-			patch_branch(ibase + (0x1e0 / 4) + 1,
-				(unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0);
-			book3e_htw_enabled = 1;
-		}
-		pr_info("MMU: Book3E Page Tables %s\n",
-			book3e_htw_enabled ? "Enabled" : "Disabled");
+		/* Look for supported page sizes */
+		setup_page_sizes();
+
+		/* Look for HW tablewalk support */
+		setup_mmu_htw();
 	}
 
 	/* Set MAS4 based on page table setting */
-- 
cgit v1.2.3


From fc53b4202e61c7e9008c241933ae282aab8a6082 Mon Sep 17 00:00:00 2001
From: Matt Evans <matt@ozlabs.org>
Date: Wed, 7 Jul 2010 21:55:37 +0000
Subject: powerpc/kexec: Switch to a static PACA on the way out

With dynamic PACAs, the kexecing CPU's PACA won't lie within the kernel
static data and there is a chance that something may stomp it when preparing
to kexec.  This patch switches this final CPU to a static PACA just before
we pull the switch.

Signed-off-by: Matt Evans <matt@ozlabs.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/paca.h        |  2 +-
 arch/powerpc/kernel/machine_kexec_64.c | 20 ++++++++++++++++++++
 arch/powerpc/kernel/paca.c             | 10 ++++++++++
 arch/powerpc/kernel/setup_64.c         | 10 ----------
 4 files changed, 31 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 8ce7963ad41d..1ff6662f7faf 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -146,7 +146,7 @@ struct paca_struct {
 extern struct paca_struct *paca;
 extern __initdata struct paca_struct boot_paca;
 extern void initialise_paca(struct paca_struct *new_paca, int cpu);
-
+extern void setup_paca(struct paca_struct *new_paca);
 extern void allocate_pacas(void);
 extern void free_unused_pacas(void);
 
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index de6b70eac51d..022d2f613b7b 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -260,6 +260,12 @@ static void kexec_prepare_cpus(void)
 static union thread_union kexec_stack __init_task_data =
 	{ };
 
+/*
+ * For similar reasons to the stack above, the kexecing CPU needs to be on a
+ * static PACA; we switch to kexec_paca.
+ */
+struct paca_struct kexec_paca;
+
 /* Our assembly helper, in kexec_stub.S */
 extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
 					void *image, void *control,
@@ -287,6 +293,20 @@ void default_machine_kexec(struct kimage *image)
 	kexec_stack.thread_info.task = current_thread_info()->task;
 	kexec_stack.thread_info.flags = 0;
 
+	/* We need a static PACA, too; copy this CPU's PACA over and switch to
+	 * it.  Also poison per_cpu_offset to catch anyone using non-static
+	 * data.
+	 */
+	memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct));
+	kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL;
+	paca = (struct paca_struct *)RELOC_HIDE(&kexec_paca, 0) -
+		kexec_paca.paca_index;
+	setup_paca(&kexec_paca);
+
+	/* XXX: If anyone does 'dynamic lppacas' this will also need to be
+	 * switched to a static version!
+	 */
+
 	/* Some things are best done in assembly.  Finding globals with
 	 * a toc is easier in C, so pass in what we can.
 	 */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index f88acf0218db..3db8d64f40e1 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -105,6 +105,16 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
 #endif /* CONFIG_PPC_STD_MMU_64 */
 }
 
+/* Put the paca pointer into r13 and SPRG_PACA */
+void setup_paca(struct paca_struct *new_paca)
+{
+	local_paca = new_paca;
+	mtspr(SPRN_SPRG_PACA, local_paca);
+#ifdef CONFIG_PPC_BOOK3E
+	mtspr(SPRN_SPRG_TLB_EXFRAME, local_paca->extlb);
+#endif
+}
+
 static int __initdata paca_size;
 
 void __init allocate_pacas(void)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index c352f322dbdd..96e662c1d46b 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -142,16 +142,6 @@ early_param("smt-enabled", early_smt_enabled);
 #define check_smt_enabled()
 #endif /* CONFIG_SMP */
 
-/* Put the paca pointer into r13 and SPRG_PACA */
-static void __init setup_paca(struct paca_struct *new_paca)
-{
-	local_paca = new_paca;
-	mtspr(SPRN_SPRG_PACA, local_paca);
-#ifdef CONFIG_PPC_BOOK3E
-	mtspr(SPRN_SPRG_TLB_EXFRAME, local_paca->extlb);
-#endif
-}
-
 /*
  * Early initialization entry point. This is called by head.S
  * with MMU translation disabled. We rely on the "feature" of
-- 
cgit v1.2.3


From 4b5006ec7bb73cd9d4c8a723d484b4c87fad4123 Mon Sep 17 00:00:00 2001
From: Anatolij Gustschin <agust@denx.de>
Date: Fri, 23 Jul 2010 04:00:37 +0000
Subject: powerpc/5121: shared DIU framebuffer support

MPC5121 DIU configuration/setup as initialized by the boot
loader currently will get lost while booting Linux. As a
result displaying the boot splash is not possible through
the boot process.

To prevent this we reserve configured DIU frame buffer
address range while booting and preserve AOI descriptor
and gamma table so that DIU continues displaying through
the whole boot process. On first open from user space
DIU frame buffer driver releases the reserved frame
buffer area and continues to operate as usual.

Signed-off-by: John Rigby <jcrigby@gmail.com>
Signed-off-by: Anatolij Gustschin <agust@denx.de>
Acked-by: Timur Tabi <timur@freescale.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 arch/powerpc/include/asm/mpc5121.h            |  32 +++
 arch/powerpc/platforms/512x/mpc5121_ads.c     |   2 +
 arch/powerpc/platforms/512x/mpc5121_generic.c |   2 +
 arch/powerpc/platforms/512x/mpc512x.h         |   2 +
 arch/powerpc/platforms/512x/mpc512x_shared.c  | 284 ++++++++++++++++++++++++++
 arch/powerpc/sysdev/fsl_soc.h                 |   1 +
 drivers/video/fsl-diu-fb.c                    |  17 +-
 7 files changed, 338 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include')

diff --git a/arch/powerpc/include/asm/mpc5121.h b/arch/powerpc/include/asm/mpc5121.h
index e6a30bb1d16a..8c0ab2ca689c 100644
--- a/arch/powerpc/include/asm/mpc5121.h
+++ b/arch/powerpc/include/asm/mpc5121.h
@@ -21,4 +21,36 @@ struct mpc512x_reset_module {
 	u32	rcer;	/* Reset Control Enable Register */
 };
 
+/*
+ * Clock Control Module
+ */
+struct mpc512x_ccm {
+	u32	spmr;	/* System PLL Mode Register */
+	u32	sccr1;	/* System Clock Control Register 1 */
+	u32	sccr2;	/* System Clock Control Register 2 */
+	u32	scfr1;	/* System Clock Frequency Register 1 */
+	u32	scfr2;	/* System Clock Frequency Register 2 */
+	u32	scfr2s;	/* System Clock Frequency Shadow Register 2 */
+	u32	bcr;	/* Bread Crumb Register */
+	u32	p0ccr;	/* PSC0 Clock Control Register */
+	u32	p1ccr;	/* PSC1 CCR */
+	u32	p2ccr;	/* PSC2 CCR */
+	u32	p3ccr;	/* PSC3 CCR */
+	u32	p4ccr;	/* PSC4 CCR */
+	u32	p5ccr;	/* PSC5 CCR */
+	u32	p6ccr;	/* PSC6 CCR */
+	u32	p7ccr;	/* PSC7 CCR */
+	u32	p8ccr;	/* PSC8 CCR */
+	u32	p9ccr;	/* PSC9 CCR */
+	u32	p10ccr;	/* PSC10 CCR */
+	u32	p11ccr;	/* PSC11 CCR */
+	u32	spccr;	/* SPDIF Clock Control Register */
+	u32	cccr;	/* CFM Clock Control Register */
+	u32	dccr;	/* DIU Clock Control Register */
+	u32	m1ccr;	/* MSCAN1 CCR */
+	u32	m2ccr;	/* MSCAN2 CCR */
+	u32	m3ccr;	/* MSCAN3 CCR */
+	u32	m4ccr;	/* MSCAN4 CCR */
+	u8	res[0x98]; /* Reserved */
+};
 #endif /* __ASM_POWERPC_MPC5121_H__ */
diff --git a/arch/powerpc/platforms/512x/mpc5121_ads.c b/arch/powerpc/platforms/512x/mpc5121_ads.c
index ee6ae129c25c..dcef6ade48e1 100644
--- a/arch/powerpc/platforms/512x/mpc5121_ads.c
+++ b/arch/powerpc/platforms/512x/mpc5121_ads.c
@@ -42,6 +42,7 @@ static void __init mpc5121_ads_setup_arch(void)
 	for_each_compatible_node(np, "pci", "fsl,mpc5121-pci")
 		mpc83xx_add_bridge(np);
 #endif
+	mpc512x_setup_diu();
 }
 
 static void __init mpc5121_ads_init_IRQ(void)
@@ -65,6 +66,7 @@ define_machine(mpc5121_ads) {
 	.probe			= mpc5121_ads_probe,
 	.setup_arch		= mpc5121_ads_setup_arch,
 	.init			= mpc512x_init,
+	.init_early		= mpc512x_init_diu,
 	.init_IRQ		= mpc5121_ads_init_IRQ,
 	.get_irq		= ipic_get_irq,
 	.calibrate_decr		= generic_calibrate_decr,
diff --git a/arch/powerpc/platforms/512x/mpc5121_generic.c b/arch/powerpc/platforms/512x/mpc5121_generic.c
index a6c0e3a2615d..e487eb06ec6b 100644
--- a/arch/powerpc/platforms/512x/mpc5121_generic.c
+++ b/arch/powerpc/platforms/512x/mpc5121_generic.c
@@ -52,6 +52,8 @@ define_machine(mpc5121_generic) {
 	.name			= "MPC5121 generic",
 	.probe			= mpc5121_generic_probe,
 	.init			= mpc512x_init,
+	.init_early		= mpc512x_init_diu,
+	.setup_arch		= mpc512x_setup_diu,
 	.init_IRQ		= mpc512x_init_IRQ,
 	.get_irq		= ipic_get_irq,
 	.calibrate_decr		= generic_calibrate_decr,
diff --git a/arch/powerpc/platforms/512x/mpc512x.h b/arch/powerpc/platforms/512x/mpc512x.h
index b2daca0d1488..1ab6d11d0b19 100644
--- a/arch/powerpc/platforms/512x/mpc512x.h
+++ b/arch/powerpc/platforms/512x/mpc512x.h
@@ -16,4 +16,6 @@ extern void __init mpc512x_init(void);
 extern int __init mpc5121_clk_init(void);
 void __init mpc512x_declare_of_platform_devices(void);
 extern void mpc512x_restart(char *cmd);
+extern void mpc512x_init_diu(void);
+extern void mpc512x_setup_diu(void);
 #endif				/* __MPC512X_H__ */
diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
index 707e572b7c40..e41ebbdb3e12 100644
--- a/arch/powerpc/platforms/512x/mpc512x_shared.c
+++ b/arch/powerpc/platforms/512x/mpc512x_shared.c
@@ -16,7 +16,11 @@
 #include <linux/io.h>
 #include <linux/irq.h>
 #include <linux/of_platform.h>
+#include <linux/fsl-diu-fb.h>
+#include <linux/bootmem.h>
+#include <sysdev/fsl_soc.h>
 
+#include <asm/cacheflush.h>
 #include <asm/machdep.h>
 #include <asm/ipic.h>
 #include <asm/prom.h>
@@ -54,6 +58,286 @@ void mpc512x_restart(char *cmd)
 		;
 }
 
+struct fsl_diu_shared_fb {
+	u8		gamma[0x300];	/* 32-bit aligned! */
+	struct diu_ad	ad0;		/* 32-bit aligned! */
+	phys_addr_t	fb_phys;
+	size_t		fb_len;
+	bool		in_use;
+};
+
+unsigned int mpc512x_get_pixel_format(unsigned int bits_per_pixel,
+				      int monitor_port)
+{
+	switch (bits_per_pixel) {
+	case 32:
+		return 0x88883316;
+	case 24:
+		return 0x88082219;
+	case 16:
+		return 0x65053118;
+	}
+	return 0x00000400;
+}
+
+void mpc512x_set_gamma_table(int monitor_port, char *gamma_table_base)
+{
+}
+
+void mpc512x_set_monitor_port(int monitor_port)
+{
+}
+
+#define DIU_DIV_MASK	0x000000ff
+void mpc512x_set_pixel_clock(unsigned int pixclock)
+{
+	unsigned long bestval, bestfreq, speed, busfreq;
+	unsigned long minpixclock, maxpixclock, pixval;
+	struct mpc512x_ccm __iomem *ccm;
+	struct device_node *np;
+	u32 temp;
+	long err;
+	int i;
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,mpc5121-clock");
+	if (!np) {
+		pr_err("Can't find clock control module.\n");
+		return;
+	}
+
+	ccm = of_iomap(np, 0);
+	of_node_put(np);
+	if (!ccm) {
+		pr_err("Can't map clock control module reg.\n");
+		return;
+	}
+
+	np = of_find_node_by_type(NULL, "cpu");
+	if (np) {
+		const unsigned int *prop =
+			of_get_property(np, "bus-frequency", NULL);
+
+		of_node_put(np);
+		if (prop) {
+			busfreq = *prop;
+		} else {
+			pr_err("Can't get bus-frequency property\n");
+			return;
+		}
+	} else {
+		pr_err("Can't find 'cpu' node.\n");
+		return;
+	}
+
+	/* Pixel Clock configuration */
+	pr_debug("DIU: Bus Frequency = %lu\n", busfreq);
+	speed = busfreq * 4; /* DIU_DIV ratio is 4 * CSB_CLK / DIU_CLK */
+
+	/* Calculate the pixel clock with the smallest error */
+	/* calculate the following in steps to avoid overflow */
+	pr_debug("DIU pixclock in ps - %d\n", pixclock);
+	temp = (1000000000 / pixclock) * 1000;
+	pixclock = temp;
+	pr_debug("DIU pixclock freq - %u\n", pixclock);
+
+	temp = temp / 20; /* pixclock * 0.05 */
+	pr_debug("deviation = %d\n", temp);
+	minpixclock = pixclock - temp;
+	maxpixclock = pixclock + temp;
+	pr_debug("DIU minpixclock - %lu\n", minpixclock);
+	pr_debug("DIU maxpixclock - %lu\n", maxpixclock);
+	pixval = speed/pixclock;
+	pr_debug("DIU pixval = %lu\n", pixval);
+
+	err = LONG_MAX;
+	bestval = pixval;
+	pr_debug("DIU bestval = %lu\n", bestval);
+
+	bestfreq = 0;
+	for (i = -1; i <= 1; i++) {
+		temp = speed / (pixval+i);
+		pr_debug("DIU test pixval i=%d, pixval=%lu, temp freq. = %u\n",
+			i, pixval, temp);
+		if ((temp < minpixclock) || (temp > maxpixclock))
+			pr_debug("DIU exceeds monitor range (%lu to %lu)\n",
+				minpixclock, maxpixclock);
+		else if (abs(temp - pixclock) < err) {
+			pr_debug("Entered the else if block %d\n", i);
+			err = abs(temp - pixclock);
+			bestval = pixval + i;
+			bestfreq = temp;
+		}
+	}
+
+	pr_debug("DIU chose = %lx\n", bestval);
+	pr_debug("DIU error = %ld\n NomPixClk ", err);
+	pr_debug("DIU: Best Freq = %lx\n", bestfreq);
+	/* Modify DIU_DIV in CCM SCFR1 */
+	temp = in_be32(&ccm->scfr1);
+	pr_debug("DIU: Current value of SCFR1: 0x%08x\n", temp);
+	temp &= ~DIU_DIV_MASK;
+	temp |= (bestval & DIU_DIV_MASK);
+	out_be32(&ccm->scfr1, temp);
+	pr_debug("DIU: Modified value of SCFR1: 0x%08x\n", temp);
+	iounmap(ccm);
+}
+
+ssize_t mpc512x_show_monitor_port(int monitor_port, char *buf)
+{
+	return sprintf(buf, "0 - 5121 LCD\n");
+}
+
+int mpc512x_set_sysfs_monitor_port(int val)
+{
+	return 0;
+}
+
+static struct fsl_diu_shared_fb __attribute__ ((__aligned__(8))) diu_shared_fb;
+
+#if defined(CONFIG_FB_FSL_DIU) || \
+    defined(CONFIG_FB_FSL_DIU_MODULE)
+static inline void mpc512x_free_bootmem(struct page *page)
+{
+	__ClearPageReserved(page);
+	BUG_ON(PageTail(page));
+	BUG_ON(atomic_read(&page->_count) > 1);
+	atomic_set(&page->_count, 1);
+	__free_page(page);
+	totalram_pages++;
+}
+
+void mpc512x_release_bootmem(void)
+{
+	unsigned long addr = diu_shared_fb.fb_phys & PAGE_MASK;
+	unsigned long size = diu_shared_fb.fb_len;
+	unsigned long start, end;
+
+	if (diu_shared_fb.in_use) {
+		start = PFN_UP(addr);
+		end = PFN_DOWN(addr + size);
+
+		for (; start < end; start++)
+			mpc512x_free_bootmem(pfn_to_page(start));
+
+		diu_shared_fb.in_use = false;
+	}
+	diu_ops.release_bootmem	= NULL;
+}
+#endif
+
+/*
+ * Check if DIU was pre-initialized. If so, perform steps
+ * needed to continue displaying through the whole boot process.
+ * Move area descriptor and gamma table elsewhere, they are
+ * destroyed by bootmem allocator otherwise. The frame buffer
+ * address range will be reserved in setup_arch() after bootmem
+ * allocator is up.
+ */
+void __init mpc512x_init_diu(void)
+{
+	struct device_node *np;
+	struct diu __iomem *diu_reg;
+	phys_addr_t desc;
+	void __iomem *vaddr;
+	unsigned long mode, pix_fmt, res, bpp;
+	unsigned long dst;
+
+	np = of_find_compatible_node(NULL, NULL, "fsl,mpc5121-diu");
+	if (!np) {
+		pr_err("No DIU node\n");
+		return;
+	}
+
+	diu_reg = of_iomap(np, 0);
+	of_node_put(np);
+	if (!diu_reg) {
+		pr_err("Can't map DIU\n");
+		return;
+	}
+
+	mode = in_be32(&diu_reg->diu_mode);
+	if (mode != MFB_MODE1) {
+		pr_info("%s: DIU OFF\n", __func__);
+		goto out;
+	}
+
+	desc = in_be32(&diu_reg->desc[0]);
+	vaddr = ioremap(desc, sizeof(struct diu_ad));
+	if (!vaddr) {
+		pr_err("Can't map DIU area desc.\n");
+		goto out;
+	}
+	memcpy(&diu_shared_fb.ad0, vaddr, sizeof(struct diu_ad));
+	/* flush fb area descriptor */
+	dst = (unsigned long)&diu_shared_fb.ad0;
+	flush_dcache_range(dst, dst + sizeof(struct diu_ad) - 1);
+
+	res = in_be32(&diu_reg->disp_size);
+	pix_fmt = in_le32(vaddr);
+	bpp = ((pix_fmt >> 16) & 0x3) + 1;
+	diu_shared_fb.fb_phys = in_le32(vaddr + 4);
+	diu_shared_fb.fb_len = ((res & 0xfff0000) >> 16) * (res & 0xfff) * bpp;
+	diu_shared_fb.in_use = true;
+	iounmap(vaddr);
+
+	desc = in_be32(&diu_reg->gamma);
+	vaddr = ioremap(desc, sizeof(diu_shared_fb.gamma));
+	if (!vaddr) {
+		pr_err("Can't map DIU area desc.\n");
+		diu_shared_fb.in_use = false;
+		goto out;
+	}
+	memcpy(&diu_shared_fb.gamma, vaddr, sizeof(diu_shared_fb.gamma));
+	/* flush gamma table */
+	dst = (unsigned long)&diu_shared_fb.gamma;
+	flush_dcache_range(dst, dst + sizeof(diu_shared_fb.gamma) - 1);
+
+	iounmap(vaddr);
+	out_be32(&diu_reg->gamma, virt_to_phys(&diu_shared_fb.gamma));
+	out_be32(&diu_reg->desc[1], 0);
+	out_be32(&diu_reg->desc[2], 0);
+	out_be32(&diu_reg->desc[0], virt_to_phys(&diu_shared_fb.ad0));
+
+out:
+	iounmap(diu_reg);
+}
+
+void __init mpc512x_setup_diu(void)
+{
+	int ret;
+
+	/*
+	 * We do not allocate and configure new area for bitmap buffer
+	 * because it would requere copying bitmap data (splash image)
+	 * and so negatively affect boot time. Instead we reserve the
+	 * already configured frame buffer area so that it won't be
+	 * destroyed. The starting address of the area to reserve and
+	 * also it's length is passed to reserve_bootmem(). It will be
+	 * freed later on first open of fbdev, when splash image is not
+	 * needed any more.
+	 */
+	if (diu_shared_fb.in_use) {
+		ret = reserve_bootmem(diu_shared_fb.fb_phys,
+				      diu_shared_fb.fb_len,
+				      BOOTMEM_EXCLUSIVE);
+		if (ret) {
+			pr_err("%s: reserve bootmem failed\n", __func__);
+			diu_shared_fb.in_use = false;
+		}
+	}
+
+#if defined(CONFIG_FB_FSL_DIU) || \
+    defined(CONFIG_FB_FSL_DIU_MODULE)
+	diu_ops.get_pixel_format	= mpc512x_get_pixel_format;
+	diu_ops.set_gamma_table		= mpc512x_set_gamma_table;
+	diu_ops.set_monitor_port	= mpc512x_set_monitor_port;
+	diu_ops.set_pixel_clock		= mpc512x_set_pixel_clock;
+	diu_ops.show_monitor_port	= mpc512x_show_monitor_port;
+	diu_ops.set_sysfs_monitor_port	= mpc512x_set_sysfs_monitor_port;
+	diu_ops.release_bootmem		= mpc512x_release_bootmem;
+#endif
+}
+
 void __init mpc512x_init_IRQ(void)
 {
 	struct device_node *np;
diff --git a/arch/powerpc/sysdev/fsl_soc.h b/arch/powerpc/sysdev/fsl_soc.h
index 42381bb6cd51..53609489a62b 100644
--- a/arch/powerpc/sysdev/fsl_soc.h
+++ b/arch/powerpc/sysdev/fsl_soc.h
@@ -30,6 +30,7 @@ struct platform_diu_data_ops {
 	void (*set_pixel_clock) (unsigned int pixclock);
 	ssize_t (*show_monitor_port) (int monitor_port, char *buf);
 	int (*set_sysfs_monitor_port) (int val);
+	void (*release_bootmem) (void);
 };
 
 extern struct platform_diu_data_ops diu_ops;
diff --git a/drivers/video/fsl-diu-fb.c b/drivers/video/fsl-diu-fb.c
index 48905d5f4e8d..db3e360e6aad 100644
--- a/drivers/video/fsl-diu-fb.c
+++ b/drivers/video/fsl-diu-fb.c
@@ -1108,6 +1108,10 @@ static int fsl_diu_open(struct fb_info *info, int user)
 	struct mfb_info *mfbi = info->par;
 	int res = 0;
 
+	/* free boot splash memory on first /dev/fb0 open */
+	if (!mfbi->index && diu_ops.release_bootmem)
+		diu_ops.release_bootmem();
+
 	spin_lock(&diu_lock);
 	mfbi->count++;
 	if (mfbi->count == 1) {
@@ -1435,6 +1439,7 @@ static int __devinit fsl_diu_probe(struct of_device *ofdev,
 	int ret, i, error = 0;
 	struct resource res;
 	struct fsl_diu_data *machine_data;
+	int diu_mode;
 
 	machine_data = kzalloc(sizeof(struct fsl_diu_data), GFP_KERNEL);
 	if (!machine_data)
@@ -1471,7 +1476,9 @@ static int __devinit fsl_diu_probe(struct of_device *ofdev,
 		goto error2;
 	}
 
-	out_be32(&dr.diu_reg->diu_mode, 0);		/* disable DIU anyway*/
+	diu_mode = in_be32(&dr.diu_reg->diu_mode);
+	if (diu_mode != MFB_MODE1)
+		out_be32(&dr.diu_reg->diu_mode, 0);	/* disable DIU */
 
 	/* Get the IRQ of the DIU */
 	machine_data->irq = irq_of_parse_and_map(np, 0);
@@ -1519,7 +1526,13 @@ static int __devinit fsl_diu_probe(struct of_device *ofdev,
 	machine_data->dummy_ad->offset_xyd = 0;
 	machine_data->dummy_ad->next_ad = 0;
 
-	out_be32(&dr.diu_reg->desc[0], machine_data->dummy_ad->paddr);
+	/*
+	 * Let DIU display splash screen if it was pre-initialized
+	 * by the bootloader, set dummy area descriptor otherwise.
+	 */
+	if (diu_mode != MFB_MODE1)
+		out_be32(&dr.diu_reg->desc[0], machine_data->dummy_ad->paddr);
+
 	out_be32(&dr.diu_reg->desc[1], machine_data->dummy_ad->paddr);
 	out_be32(&dr.diu_reg->desc[2], machine_data->dummy_ad->paddr);
 
-- 
cgit v1.2.3