Merge tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc updates from Michael Ellerman: "Highlights include: - Larger virtual address space on 64-bit server CPUs. By default we use a 128TB virtual address space, but a process can request access to the full 512TB by passing a hint to mmap(). - Support for the new Power9 "XIVE" interrupt controller. - TLB flushing optimisations for the radix MMU on Power9. - Support for CAPI cards on Power9, using the "Coherent Accelerator Interface Architecture 2.0". - The ability to configure the mmap randomisation limits at build and runtime. - Several small fixes and cleanups to the kprobes code, as well as support for KPROBES_ON_FTRACE. - Major improvements to handling of system reset interrupts, correctly treating them as NMIs, giving them a dedicated stack and using a new hypervisor call to trigger them, all of which should aid debugging and robustness. - Many fixes and other minor enhancements. Thanks to: Alastair D'Silva, Alexey Kardashevskiy, Alistair Popple, Andrew Donnellan, Aneesh Kumar K.V, Anshuman Khandual, Anton Blanchard, Balbir Singh, Ben Hutchings, Benjamin Herrenschmidt, Bhupesh Sharma, Chris Packham, Christian Zigotzky, Christophe Leroy, Christophe Lombard, Daniel Axtens, David Gibson, Gautham R. Shenoy, Gavin Shan, Geert Uytterhoeven, Guilherme G. Piccoli, Hamish Martin, Hari Bathini, Kees Cook, Laurent Dufour, Madhavan Srinivasan, Mahesh J Salgaonkar, Mahesh Salgaonkar, Masami Hiramatsu, Matt Brown, Matthew R. Ochs, Michael Neuling, Naveen N. Rao, Nicholas Piggin, Oliver O'Halloran, Pan Xinhui, Paul Mackerras, Rashmica Gupta, Russell Currey, Sukadev Bhattiprolu, Thadeu Lima de Souza Cascardo, Tobin C. Harding, Tyrel Datwyler, Uma Krishnan, Vaibhav Jain, Vipin K Parashar, Yang Shi" * tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (214 commits) powerpc/64s: Power9 has no LPCR[VRMASD] field so don't set it powerpc/powernv: Fix TCE kill on NVLink2 powerpc/mm/radix: Drop support for CPUs without lockless tlbie powerpc/book3s/mce: Move add_taint() later in virtual mode powerpc/sysfs: Move #ifdef CONFIG_HOTPLUG_CPU out of the function body powerpc/smp: Document irq enable/disable after migrating IRQs powerpc/mpc52xx: Don't select user-visible RTAS_PROC powerpc/powernv: Document cxl dependency on special case in pnv_eeh_reset() powerpc/eeh: Clean up and document event handling functions powerpc/eeh: Avoid use after free in eeh_handle_special_event() cxl: Mask slice error interrupts after first occurrence cxl: Route eeh events to all drivers in cxl_pci_error_detected() cxl: Force context lock during EEH flow powerpc/64: Allow CONFIG_RELOCATABLE if COMPILE_TEST powerpc/xmon: Teach xmon oops about radix vectors powerpc/mm/hash: Fix off-by-one in comment about kernel contexts ids powerpc/pseries: Enable VFIO powerpc/powernv: Fix iommu table size calculation hook for small tables powerpc/powernv: Check kzalloc() return value in pnv_pci_table_alloc powerpc: Add arch/powerpc/tools directory ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-05 11:36:44 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-05-05 11:36:44 -0700
commit: 7246f60068840847bdcf595be5f0b5ca632736e0 (patch)
tree: fd9a963a03c2655f3ba9d1ced3c87a2775f5b166
parent: e579dde654fc2c6b0d3e4b77a9a4b2d2405c510e (diff)
parent: 700b7eadd5625d22b8235fb21259b3d7d564c000 (diff)
download: lwn-7246f60068840847bdcf595be5f0b5ca632736e0.tar.gz
lwn-7246f60068840847bdcf595be5f0b5ca632736e0.zip
212 files changed, 8698 insertions, 2790 deletions
diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
index f9133a921d5a..1e84be3c142e 100644
--- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
+++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
@@ -26,7 +26,7 @@
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
-    |     powerpc: | TODO |
+    |     powerpc: |  ok  |
     |        s390: | TODO |
     |       score: | TODO |
     |          sh: | TODO |
diff --git a/Documentation/powerpc/cxl.txt b/Documentation/powerpc/cxl.txt
index d5506ba0fef7..c5e8d5098ed3 100644
--- a/Documentation/powerpc/cxl.txt
+++ b/Documentation/powerpc/cxl.txt
@@ -21,7 +21,7 @@ Introduction
 Hardware overview
 =================
 
-          POWER8               FPGA
+         POWER8/9             FPGA
        +----------+        +---------+
        |          |        |         |
        |   CPU    |        |   AFU   |
@@ -34,7 +34,7 @@ Hardware overview
        |   | CAPP |<------>|         |
        +---+------+  PCIE  +---------+
 
-    The POWER8 chip has a Coherently Attached Processor Proxy (CAPP)
+    The POWER8/9 chip has a Coherently Attached Processor Proxy (CAPP)
     unit which is part of the PCIe Host Bridge (PHB). This is managed
     by Linux by calls into OPAL. Linux doesn't directly program the
     CAPP.
@@ -59,6 +59,17 @@ Hardware overview
     the fault. The context to which this fault is serviced is based on
     who owns that acceleration function.
 
+    POWER8 <-----> PSL Version 8 is compliant to the CAIA Version 1.0.
+    POWER9 <-----> PSL Version 9 is compliant to the CAIA Version 2.0.
+    This PSL Version 9 provides new features such as:
+    * Interaction with the nest MMU on the P9 chip.
+    * Native DMA support.
+    * Supports sending ASB_Notify messages for host thread wakeup.
+    * Supports Atomic operations.
+    * ....
+
+    Cards with a PSL9 won't work on a POWER8 system and cards with a
+    PSL8 won't work on a POWER9 system.
 
 AFU Modes
 =========
diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt
index 3007bc98af28..19b1e3d09a19 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -105,21 +105,21 @@ memory is held.
 
 If there is no waiting dump data, then only the memory required
 to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is reserved at the top of memory (see Fig. 1). This area
-is *not* released: this region will be kept permanently reserved,
-so that it can act as a receptacle for a copy of the boot memory
-content in addition to CPU state and HPTE region, in the case a
-crash does occur.
+header, is usually reserved at an offset greater than boot memory
+size (see Fig. 1). This area is *not* released: this region will
+be kept permanently reserved, so that it can act as a receptacle
+for a copy of the boot memory content in addition to CPU state
+and HPTE region, in the case a crash does occur.
 
   o Memory Reservation during first kernel
 
-  Low memory                                        Top of memory
+  Low memory                                         Top of memory
   0      boot memory size                                       |
-  |           |                       |<--Reserved dump area -->|
-  V           V                       |   Permanent Reservation V
-  +-----------+----------/ /----------+---+----+-----------+----+
-  |           |                       |CPU|HPTE|  DUMP     |ELF |
-  +-----------+----------/ /----------+---+----+-----------+----+
+  |           |                |<--Reserved dump area -->|      |
+  V           V                |   Permanent Reservation |      V
+  +-----------+----------/ /---+---+----+-----------+----+------+
+  |           |                |CPU|HPTE|  DUMP     |ELF |      |
+  +-----------+----------/ /---+---+----+-----------+----+------+
         |                                           ^
         |                                           |
         \                                           /
@@ -135,12 +135,12 @@ crash does occur.
   0      boot memory size                                       |
   |           |<------------- Reserved dump area ----------- -->|
   V           V                                                 V
-  +-----------+----------/ /----------+---+----+-----------+----+
-  |           |                       |CPU|HPTE|  DUMP     |ELF |
-  +-----------+----------/ /----------+---+----+-----------+----+
-        |                                                    |
-        V                                                    V
-   Used by second                                    /proc/vmcore
+  +-----------+----------/ /---+---+----+-----------+----+------+
+  |           |                |CPU|HPTE|  DUMP     |ELF |      |
+  +-----------+----------/ /---+---+----+-----------+----+------+
+        |                                              |
+        V                                              V
+   Used by second                                /proc/vmcore
    kernel to boot
                    Fig. 2
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 5ee3125f8341..b1f9b45f061f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5310,6 +5310,7 @@ M:	Scott Wood <oss@buserror.net>
 L:	linuxppc-dev@lists.ozlabs.org
 L:	linux-arm-kernel@lists.infradead.org
 S:	Maintained
+F:	Documentation/devicetree/bindings/powerpc/fsl/
 F:	drivers/soc/fsl/
 F:	include/linux/fsl/
 
@@ -7568,7 +7569,7 @@ Q:	http://patchwork.ozlabs.org/project/linuxppc-dev/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
 S:	Supported
 F:	Documentation/ABI/stable/sysfs-firmware-opal-*
-F:	Documentation/devicetree/bindings/powerpc/opal/
+F:	Documentation/devicetree/bindings/powerpc/
 F:	Documentation/devicetree/bindings/rtc/rtc-opal.txt
 F:	Documentation/devicetree/bindings/i2c/i2c-opal.txt
 F:	Documentation/powerpc/
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 053382616533..f07f727cbfd2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -22,6 +22,48 @@ config MMU
 	bool
 	default y
 
+config ARCH_MMAP_RND_BITS_MAX
+	# On Book3S 64, the default virtual address space for 64-bit processes
+	# is 2^47 (128TB). As a maximum, allow randomisation to consume up to
+	# 32T of address space (2^45), which should ensure a reasonable gap
+	# between bottom-up and top-down allocations for applications that
+	# consume "normal" amounts of address space. Book3S 64 only supports 64K
+	# and 4K page sizes.
+	default 29 if PPC_BOOK3S_64 && PPC_64K_PAGES # 29 = 45 (32T) - 16 (64K)
+	default 33 if PPC_BOOK3S_64		     # 33 = 45 (32T) - 12 (4K)
+	#
+	# On all other 64-bit platforms (currently only Book3E), the virtual
+	# address space is 2^46 (64TB). Allow randomisation to consume up to 16T
+	# of address space (2^44). Only 4K page sizes are supported.
+	default 32 if 64BIT	# 32 = 44 (16T) - 12 (4K)
+	#
+	# For 32-bit, use the compat values, as they're the same.
+	default ARCH_MMAP_RND_COMPAT_BITS_MAX
+
+config ARCH_MMAP_RND_BITS_MIN
+	# Allow randomisation to consume up to 1GB of address space (2^30).
+	default 14 if 64BIT && PPC_64K_PAGES	# 14 = 30 (1GB) - 16 (64K)
+	default 18 if 64BIT			# 18 = 30 (1GB) - 12 (4K)
+	#
+	# For 32-bit, use the compat values, as they're the same.
+	default ARCH_MMAP_RND_COMPAT_BITS_MIN
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+	# Total virtual address space for 32-bit processes is 2^31 (2GB).
+	# Allow randomisation to consume up to 512MB of address space (2^29).
+	default 11 if PPC_256K_PAGES	# 11 = 29 (512MB) - 18 (256K)
+	default 13 if PPC_64K_PAGES	# 13 = 29 (512MB) - 16 (64K)
+	default 15 if PPC_16K_PAGES 	# 15 = 29 (512MB) - 14 (16K)
+	default 17			# 17 = 29 (512MB) - 12 (4K)
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+	# Total virtual address space for 32-bit processes is 2^31 (2GB).
+	# Allow randomisation to consume up to 8MB of address space (2^23).
+	default 5 if PPC_256K_PAGES	#  5 = 23 (8MB) - 18 (256K)
+	default 7 if PPC_64K_PAGES	#  7 = 23 (8MB) - 16 (64K)
+	default 9 if PPC_16K_PAGES	#  9 = 23 (8MB) - 14 (16K)
+	default 11			# 11 = 23 (8MB) - 12 (4K)
+
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool PPC64
 
@@ -38,6 +80,11 @@ config NR_IRQS
 	  /proc/interrupts. If you configure your system to have too few,
 	  drivers will fail to load or worse - handle with care.
 
+config NMI_IPI
+	bool
+	depends on SMP && (DEBUGGER || KEXEC_CORE)
+	default y
+
 config STACKTRACE_SUPPORT
 	bool
 	default y
@@ -119,6 +166,8 @@ config PPC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_MMAP_RND_BITS
+	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_CBPF_JIT			if !PPC64
@@ -141,6 +190,7 @@ config PPC
 	select HAVE_IRQ_EXIT_ON_IRQ_STACK
 	select HAVE_KERNEL_GZIP
 	select HAVE_KPROBES
+	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_KRETPROBES
 	select HAVE_LIVEPATCH			if HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_MEMBLOCK
@@ -489,7 +539,7 @@ config KEXEC_FILE
 
 config RELOCATABLE
 	bool "Build a relocatable kernel"
-	depends on (PPC64 && !COMPILE_TEST) || (FLATMEM && (44x || FSL_BOOKE))
+	depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE))
 	select NONSTATIC_KERNEL
 	select MODULE_REL_CRCS if MODVERSIONS
 	help
@@ -523,7 +573,7 @@ config RELOCATABLE_TEST
 config CRASH_DUMP
 	bool "Build a kdump crash kernel"
 	depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP)
-	select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE
+	select RELOCATABLE if PPC64 || 44x || FSL_BOOKE
 	help
 	  Build a kernel suitable for use as a kdump capture kernel.
 	  The same kernel binary can be used as production kernel and dump
@@ -585,7 +635,7 @@ config ARCH_SPARSEMEM_ENABLE
 
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
-	depends on (SMP && PPC_PSERIES) || PPC_PS3
+	depends on PPC_BOOK3S_64
 
 config SYS_SUPPORTS_HUGETLBFS
 	bool
@@ -677,6 +727,16 @@ config PPC_256K_PAGES
 
 endchoice
 
+config THREAD_SHIFT
+	int "Thread shift" if EXPERT
+	range 13 15
+	default "15" if PPC_256K_PAGES
+	default "14" if PPC64
+	default "13"
+	help
+	  Used to define the stack size. The default is almost always what you
+	  want. Only change this if you know what you are doing.
+
 config FORCE_MAX_ZONEORDER
 	int "Maximum zone order"
 	range 8 9 if PPC64 && PPC_64K_PAGES
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 19b0d1a81959..3e0f0e1fadef 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -136,7 +136,7 @@ CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64
 endif
 
 ifdef CONFIG_MPROFILE_KERNEL
-    ifeq ($(shell $(srctree)/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
+    ifeq ($(shell $(srctree)/arch/powerpc/tools/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
         CC_FLAGS_FTRACE := -pg -mprofile-kernel
         KBUILD_CPPFLAGS += -DCC_USING_MPROFILE_KERNEL
     else
@@ -274,17 +274,6 @@ PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2)
 
 boot := arch/$(ARCH)/boot
 
-ifeq ($(CONFIG_RELOCATABLE),y)
-quiet_cmd_relocs_check = CALL    $<
-      cmd_relocs_check = $(CONFIG_SHELL) $< "$(OBJDUMP)" "$(obj)/vmlinux"
-
-PHONY += relocs_check
-relocs_check: arch/powerpc/relocs_check.sh vmlinux
-	$(call cmd,relocs_check)
-
-zImage: relocs_check
-endif
-
 $(BOOT_TARGETS1): vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 $(BOOT_TARGETS2): vmlinux
diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink
new file mode 100644
index 000000000000..3c22d64b2de9
--- /dev/null
+++ b/arch/powerpc/Makefile.postlink
@@ -0,0 +1,34 @@
+# ===========================================================================
+# Post-link powerpc pass
+# ===========================================================================
+#
+# 1. Check that vmlinux relocations look sane
+
+PHONY := __archpost
+__archpost:
+
+include include/config/auto.conf
+include scripts/Kbuild.include
+
+quiet_cmd_relocs_check = CHKREL  $@
+      cmd_relocs_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@"
+
+# `@true` prevents complaint when there is nothing to be done
+
+vmlinux: FORCE
+	@true
+ifdef CONFIG_RELOCATABLE
+	$(call if_changed,relocs_check)
+endif
+
+%.ko: FORCE
+	@true
+
+clean:
+	@true
+
+PHONY += FORCE clean
+
+FORCE:
+
+.PHONY: $(PHONY)
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index ac8b8332ed82..0695ce047d56 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -33,7 +33,7 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
@@ -261,7 +261,7 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
@@ -306,7 +306,7 @@ CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index f2e03f032041..5175028c56ce 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -19,7 +19,7 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
@@ -290,7 +290,7 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
@@ -339,7 +339,7 @@ CONFIG_PPC_EARLY_DEBUG=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 4ff68b752618..1a61aa20dfba 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -34,7 +34,7 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
@@ -259,7 +259,7 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
@@ -303,7 +303,7 @@ CONFIG_XMON=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index f6c5264287e5..7330150bfe34 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -17,6 +17,8 @@
 #include <asm/checksum.h>
 #include <linux/uaccess.h>
 #include <asm/epapr_hcalls.h>
+#include <asm/dcr.h>
+#include <asm/mmu_context.h>
 
 #include <uapi/asm/ucontext.h>
 
@@ -120,6 +122,8 @@ extern s64 __ashrdi3(s64, int);
 extern int __cmpdi2(s64, s64);
 extern int __ucmpdi2(u64, u64);
 
+/* tracing */
 void _mcount(void);
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip);
 
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index bc5fdfd22788..33a24fdd7958 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -55,6 +55,14 @@
 #define PPC_BITEXTRACT(bits, ppc_bit, dst_bit)			\
 	((((bits) >> PPC_BITLSHIFT(ppc_bit)) & 1) << (dst_bit))
 
+#define PPC_BITLSHIFT32(be)	(32 - 1 - (be))
+#define PPC_BIT32(bit)		(1UL << PPC_BITLSHIFT32(bit))
+#define PPC_BITMASK32(bs, be)	((PPC_BIT32(bs) - PPC_BIT32(be))|PPC_BIT32(bs))
+
+#define PPC_BITLSHIFT8(be)	(8 - 1 - (be))
+#define PPC_BIT8(bit)		(1UL << PPC_BITLSHIFT8(bit))
+#define PPC_BITMASK8(bs, be)	((PPC_BIT8(bs) - PPC_BIT8(be))|PPC_BIT8(bs))
+
 #include <asm/barrier.h>
 
 /* Macro for generating the ***_bits() functions */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 0c4e470571ca..b4b5e6b671ca 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -8,7 +8,7 @@
 #define H_PTE_INDEX_SIZE  9
 #define H_PMD_INDEX_SIZE  7
 #define H_PUD_INDEX_SIZE  9
-#define H_PGD_INDEX_SIZE  9
+#define H_PGD_INDEX_SIZE  12
 
 #ifndef __ASSEMBLY__
 #define H_PTE_TABLE_SIZE	(sizeof(pte_t) << H_PTE_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index f3dd21efa2ea..214219dff87c 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -4,10 +4,14 @@
 #define H_PTE_INDEX_SIZE  8
 #define H_PMD_INDEX_SIZE  5
 #define H_PUD_INDEX_SIZE  5
-#define H_PGD_INDEX_SIZE  12
+#define H_PGD_INDEX_SIZE  15
 
-#define H_PAGE_COMBO	0x00001000 /* this is a combo 4k page */
-#define H_PAGE_4K_PFN	0x00002000 /* PFN is for a single 4k page */
+/*
+ * 64k aligned address free up few of the lower bits of RPN for us
+ * We steal that here. For more deatils look at pte_pfn/pfn_pte()
+ */
+#define H_PAGE_COMBO	_RPAGE_RPN0 /* this is a combo 4k page */
+#define H_PAGE_4K_PFN	_RPAGE_RPN1 /* PFN is for a single 4k page */
 /*
  * We need to differentiate between explicit huge page and THP huge
  * page, since THP huge page also need to track real subpage details
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index f7b721bbf918..4e957b027fe0 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -6,19 +6,13 @@
  * Common bits between 4K and 64K pages in a linux-style PTE.
  * Additional bits may be defined in pgtable-hash64-*.h
  *
- * Note: We only support user read/write permissions. Supervisor always
- * have full read/write to pages above PAGE_OFFSET (pages below that
- * always use the user access permissions).
- *
- * We could create separate kernel read-only if we used the 3 PP bits
- * combinations that newer processors provide but we currently don't.
  */
-#define H_PAGE_BUSY		0x00800 /* software: PTE & hash are busy */
 #define H_PTE_NONE_MASK		_PAGE_HPTEFLAGS
-#define H_PAGE_F_GIX_SHIFT	57
-#define H_PAGE_F_GIX		(7ul << 57)	/* HPTE index within HPTEG */
-#define H_PAGE_F_SECOND		(1ul << 60)	/* HPTE is in 2ndary HPTEG */
-#define H_PAGE_HASHPTE		(1ul << 61)	/* PTE has associated HPTE */
+#define H_PAGE_F_GIX_SHIFT	56
+#define H_PAGE_BUSY		_RPAGE_RSV1 /* software: PTE & hash are busy */
+#define H_PAGE_F_SECOND		_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX		(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_HASHPTE		_RPAGE_RPN43	/* PTE has associated HPTE */
 
 #ifdef CONFIG_PPC_64K_PAGES
 #include <asm/book3s/64/hash-64k.h>
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index c62f14d0bec1..6666cd366596 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -46,7 +46,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
 	 */
 	VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift);
 	if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift)
-		return __pte(pte_val(entry) | _PAGE_LARGE);
+		return __pte(pte_val(entry) | R_PAGE_LARGE);
 	else
 		return entry;
 }
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 52d8d1e4b772..6981a52b3887 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -39,6 +39,7 @@
 
 /* Bits in the SLB VSID word */
 #define SLB_VSID_SHIFT		12
+#define SLB_VSID_SHIFT_256M	SLB_VSID_SHIFT
 #define SLB_VSID_SHIFT_1T	24
 #define SLB_VSID_SSIZE_SHIFT	62
 #define SLB_VSID_B		ASM_CONST(0xc000000000000000)
@@ -408,7 +409,7 @@ static inline unsigned long hpt_vpn(unsigned long ea,
 static inline unsigned long hpt_hash(unsigned long vpn,
 				     unsigned int shift, int ssize)
 {
-	int mask;
+	unsigned long mask;
 	unsigned long hash, vsid;
 
 	/* VPN_SHIFT can be atmost 12 */
@@ -491,13 +492,14 @@ extern void slb_set_size(u16 size);
  * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated
  * from mmu context id and effective segment id of the address.
  *
- * For user processes max context id is limited to ((1ul << 19) - 5)
- * for kernel space, we use the top 4 context ids to map address as below
+ * For user processes max context id is limited to MAX_USER_CONTEXT.
+
+ * For kernel space, we use context ids 1-4 to map addresses as below:
  * NOTE: each context only support 64TB now.
- * 0x7fffc -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ * 0x00001 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
+ * 0x00002 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
+ * 0x00003 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
+ * 0x00004 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
  *
  * The proto-VSIDs are then scrambled into real VSIDs with the
  * multiplicative hash:
@@ -511,20 +513,28 @@ extern void slb_set_size(u16 size);
  * robust scattering in the hash table (at least based on some initial
  * results).
  *
- * We also consider VSID 0 special. We use VSID 0 for slb entries mapping
- * bad address. This enables us to consolidate bad address handling in
- * hash_page.
+ * We use VSID 0 to indicate an invalid VSID. The means we can't use context id
+ * 0, because a context id of 0 and an EA of 0 gives a proto-VSID of 0, which
+ * will produce a VSID of 0.
  *
  * We also need to avoid the last segment of the last context, because that
  * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
- * because of the modulo operation in vsid scramble. But the vmemmap
- * (which is what uses region 0xf) will never be close to 64TB in size
- * (it's 56 bytes per page of system memory).
+ * because of the modulo operation in vsid scramble.
  */
 
+/*
+ * Max Va bits we support as of now is 68 bits. We want 19 bit
+ * context ID.
+ * Restrictions:
+ * GPU has restrictions of not able to access beyond 128TB
+ * (47 bit effective address). We also cannot do more than 20bit PID.
+ * For p4 and p5 which can only do 65 bit VA, we restrict our CONTEXT_BITS
+ * to 16 bits (ie, we can only have 2^16 pids at the same time).
+ */
+#define VA_BITS			68
 #define CONTEXT_BITS		19
-#define ESID_BITS		18
-#define ESID_BITS_1T		6
+#define ESID_BITS		(VA_BITS - (SID_SHIFT + CONTEXT_BITS))
+#define ESID_BITS_1T		(VA_BITS - (SID_SHIFT_1T + CONTEXT_BITS))
 
 #define ESID_BITS_MASK		((1 << ESID_BITS) - 1)
 #define ESID_BITS_1T_MASK	((1 << ESID_BITS_1T) - 1)
@@ -532,63 +542,70 @@ extern void slb_set_size(u16 size);
 /*
  * 256MB segment
  * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
- * available for user + kernel mapping. The top 4 contexts are used for
- * kernel mapping. Each segment contains 2^28 bytes. Each
- * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
- * (19 == 37 + 28 - 46).
+ * available for user + kernel mapping. VSID 0 is reserved as invalid, contexts
+ * 1-4 are used for kernel mapping. Each segment contains 2^28 bytes. Each
+ * context maps 2^49 bytes (512TB).
+ *
+ * We also need to avoid the last segment of the last context, because that
+ * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
+ * because of the modulo operation in vsid scramble.
+ */
+#define MAX_USER_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 2)
+#define MIN_USER_CONTEXT	(5)
+
+/* Would be nice to use KERNEL_REGION_ID here */
+#define KERNEL_REGION_CONTEXT_OFFSET	(0xc - 1)
+
+/*
+ * For platforms that support on 65bit VA we limit the context bits
  */
-#define MAX_USER_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 5)
+#define MAX_USER_CONTEXT_65BIT_VA ((ASM_CONST(1) << (65 - (SID_SHIFT + ESID_BITS))) - 2)
 
 /*
  * This should be computed such that protovosid * vsid_mulitplier
- * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus
+ * doesn't overflow 64 bits. The vsid_mutliplier should also be
+ * co-prime to vsid_modulus. We also need to make sure that number
+ * of bits in multiplied result (dividend) is less than twice the number of
+ * protovsid bits for our modulus optmization to work.
+ *
+ * The below table shows the current values used.
+ * |-------+------------+----------------------+------------+-------------------|
+ * |       | Prime Bits | proto VSID_BITS_65VA | Total Bits | 2* prot VSID_BITS |
+ * |-------+------------+----------------------+------------+-------------------|
+ * | 1T    |         24 |                   25 |         49 |                50 |
+ * |-------+------------+----------------------+------------+-------------------|
+ * | 256MB |         24 |                   37 |         61 |                74 |
+ * |-------+------------+----------------------+------------+-------------------|
+ *
+ * |-------+------------+----------------------+------------+--------------------|
+ * |       | Prime Bits | proto VSID_BITS_68VA | Total Bits | 2* proto VSID_BITS |
+ * |-------+------------+----------------------+------------+--------------------|
+ * | 1T    |         24 |                   28 |         52 |                 56 |
+ * |-------+------------+----------------------+------------+--------------------|
+ * | 256MB |         24 |                   40 |         64 |                 80 |
+ * |-------+------------+----------------------+------------+--------------------|
+ *
  */
 #define VSID_MULTIPLIER_256M	ASM_CONST(12538073)	/* 24-bit prime */
-#define VSID_BITS_256M		(CONTEXT_BITS + ESID_BITS)
-#define VSID_MODULUS_256M	((1UL<<VSID_BITS_256M)-1)
+#define VSID_BITS_256M		(VA_BITS - SID_SHIFT)
+#define VSID_BITS_65_256M	(65 - SID_SHIFT)
+/*
+ * Modular multiplicative inverse of VSID_MULTIPLIER under modulo VSID_MODULUS
+ */
+#define VSID_MULINV_256M	ASM_CONST(665548017062)
 
 #define VSID_MULTIPLIER_1T	ASM_CONST(12538073)	/* 24-bit prime */
-#define VSID_BITS_1T		(CONTEXT_BITS + ESID_BITS_1T)
-#define VSID_MODULUS_1T		((1UL<<VSID_BITS_1T)-1)
-
+#define VSID_BITS_1T		(VA_BITS - SID_SHIFT_1T)
+#define VSID_BITS_65_1T		(65 - SID_SHIFT_1T)
+#define VSID_MULINV_1T		ASM_CONST(209034062)
 
+/* 1TB VSID reserved for VRMA */
+#define VRMA_VSID	0x1ffffffUL
 #define USER_VSID_RANGE	(1UL << (ESID_BITS + SID_SHIFT))
 
-/*
- * This macro generates asm code to compute the VSID scramble
- * function.  Used in slb_allocate() and do_stab_bolted.  The function
- * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
- *
- *	rt = register containing the proto-VSID and into which the
- *		VSID will be stored
- *	rx = scratch register (clobbered)
- *
- * 	- rt and rx must be different registers
- * 	- The answer will end up in the low VSID_BITS bits of rt.  The higher
- * 	  bits may contain other garbage, so you may need to mask the
- * 	  result.
- */
-#define ASM_VSID_SCRAMBLE(rt, rx, size)					\
-	lis	rx,VSID_MULTIPLIER_##size@h;				\
-	ori	rx,rx,VSID_MULTIPLIER_##size@l;				\
-	mulld	rt,rt,rx;		/* rt = rt * MULTIPLIER */	\
-									\
-	srdi	rx,rt,VSID_BITS_##size;					\
-	clrldi	rt,rt,(64-VSID_BITS_##size);				\
-	add	rt,rt,rx;		/* add high and low bits */	\
-	/* NOTE: explanation based on VSID_BITS_##size = 36		\
-	 * Now, r3 == VSID (mod 2^36-1), and lies between 0 and		\
-	 * 2^36-1+2^28-1.  That in particular means that if r3 >=	\
-	 * 2^36-1, then r3+1 has the 2^36 bit set.  So, if r3+1 has	\
-	 * the bit clear, r3 already has the answer we want, if it	\
-	 * doesn't, the answer is the low 36 bits of r3+1.  So in all	\
-	 * cases the answer is the low 36 bits of (r3 + ((r3+1) >> 36))*/\
-	addi	rx,rt,1;						\
-	srdi	rx,rx,VSID_BITS_##size;	/* extract 2^VSID_BITS bit */	\
-	add	rt,rt,rx
-
 /* 4 bits per slice and we have one slice per 1TB */
-#define SLICE_ARRAY_SIZE  (H_PGTABLE_RANGE >> 41)
+#define SLICE_ARRAY_SIZE	(H_PGTABLE_RANGE >> 41)
+#define TASK_SLICE_ARRAY_SZ(x)	((x)->context.addr_limit >> 41)
 
 #ifndef __ASSEMBLY__
 
@@ -634,7 +651,7 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
 #define vsid_scramble(protovsid, size) \
 	((((protovsid) * VSID_MULTIPLIER_##size) % VSID_MODULUS_##size))
 
-#else /* 1 */
+/* simplified form avoiding mod operation */
 #define vsid_scramble(protovsid, size) \
 	({								 \
 		unsigned long x;					 \
@@ -642,6 +659,21 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
 		x = (x >> VSID_BITS_##size) + (x & VSID_MODULUS_##size); \
 		(x + ((x+1) >> VSID_BITS_##size)) & VSID_MODULUS_##size; \
 	})
+
+#else /* 1 */
+static inline unsigned long vsid_scramble(unsigned long protovsid,
+				  unsigned long vsid_multiplier, int vsid_bits)
+{
+	unsigned long vsid;
+	unsigned long vsid_modulus = ((1UL << vsid_bits) - 1);
+	/*
+	 * We have same multipler for both 256 and 1T segements now
+	 */
+	vsid = protovsid * vsid_multiplier;
+	vsid = (vsid >> vsid_bits) + (vsid & vsid_modulus);
+	return (vsid + ((vsid + 1) >> vsid_bits)) & vsid_modulus;
+}
+
 #endif /* 1 */
 
 /* Returns the segment size indicator for a user address */
@@ -656,36 +688,56 @@ static inline int user_segment_size(unsigned long addr)
 static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
 				     int ssize)
 {
+	unsigned long va_bits = VA_BITS;
+	unsigned long vsid_bits;
+	unsigned long protovsid;
+
 	/*
 	 * Bad address. We return VSID 0 for that
 	 */
 	if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
 		return 0;
 
-	if (ssize == MMU_SEGSIZE_256M)
-		return vsid_scramble((context << ESID_BITS)
-				     | ((ea >> SID_SHIFT) & ESID_BITS_MASK), 256M);
-	return vsid_scramble((context << ESID_BITS_1T)
-			     | ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK), 1T);
+	if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
+		va_bits = 65;
+
+	if (ssize == MMU_SEGSIZE_256M) {
+		vsid_bits = va_bits - SID_SHIFT;
+		protovsid = (context << ESID_BITS) |
+			((ea >> SID_SHIFT) & ESID_BITS_MASK);
+		return vsid_scramble(protovsid, VSID_MULTIPLIER_256M, vsid_bits);
+	}
+	/* 1T segment */
+	vsid_bits = va_bits - SID_SHIFT_1T;
+	protovsid = (context << ESID_BITS_1T) |
+		((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK);
+	return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits);
 }
 
 /*
  * This is only valid for addresses >= PAGE_OFFSET
- *
- * For kernel space, we use the top 4 context ids to map address as below
- * 0x7fffc -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
  */
 static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
 {
 	unsigned long context;
 
+	if (!is_kernel_addr(ea))
+		return 0;
+
 	/*
-	 * kernel take the top 4 context from the available range
+	 * For kernel space, we use context ids 1-4 to map the address space as
+	 * below:
+	 *
+	 * 0x00001 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
+	 * 0x00002 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
+	 * 0x00003 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
+	 * 0x00004 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
+	 *
+	 * So we can compute the context from the region (top nibble) by
+	 * subtracting 11, or 0xc - 1.
 	 */
-	context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
+	context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET;
+
 	return get_vsid(context, ea, ssize);
 }
 
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 805d4105e9bb..77529a3e3811 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -65,6 +65,8 @@ extern struct patb_entry *partition_tb;
  * MAX_USER_CONTEXT * 16 bytes of space.
  */
 #define PRTB_SIZE_SHIFT	(CONTEXT_BITS + 4)
+#define PRTB_ENTRIES	(1ul << CONTEXT_BITS)
+
 /*
  * Power9 currently only support 64K partition table size.
  */
@@ -73,13 +75,20 @@ extern struct patb_entry *partition_tb;
 typedef unsigned long mm_context_id_t;
 struct spinlock;
 
+/* Maximum possible number of NPUs in a system. */
+#define NV_MAX_NPUS 8
+
 typedef struct {
 	mm_context_id_t id;
 	u16 user_psize;		/* page size index */
 
+	/* NPU NMMU context */
+	struct npu_context *npu_context;
+
 #ifdef CONFIG_PPC_MM_SLICES
 	u64 low_slices_psize;	/* SLB page size encodings */
 	unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
+	unsigned long addr_limit;
 #else
 	u16 sllp;		/* SLB page size encoding */
 #endif
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8f4d41936e5a..85bc9875c3be 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -13,6 +13,7 @@
 #define _PAGE_BIT_SWAP_TYPE	0
 
 #define _PAGE_RO		0
+#define _PAGE_SHARED		0
 
 #define _PAGE_EXEC		0x00001 /* execute permission */
 #define _PAGE_WRITE		0x00002 /* write access allowed */
@@ -37,21 +38,47 @@
 #define _RPAGE_RSV3		0x0400000000000000UL
 #define _RPAGE_RSV4		0x0200000000000000UL
 
-#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
-#else
-#define _PAGE_SOFT_DIRTY	0x00000
-#endif
-#define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
+#define _PAGE_PTE		0x4000000000000000UL	/* distinguishes PTEs from pointers */
+#define _PAGE_PRESENT		0x8000000000000000UL	/* pte contains a translation */
 
 /*
- * For P9 DD1 only, we need to track whether the pte's huge.
+ * Top and bottom bits of RPN which can be used by hash
+ * translation mode, because we expect them to be zero
+ * otherwise.
  */
-#define _PAGE_LARGE	_RPAGE_RSV1
+#define _RPAGE_RPN0		0x01000
+#define _RPAGE_RPN1		0x02000
+#define _RPAGE_RPN44		0x0100000000000000UL
+#define _RPAGE_RPN43		0x0080000000000000UL
+#define _RPAGE_RPN42		0x0040000000000000UL
+#define _RPAGE_RPN41		0x0020000000000000UL
+
+/* Max physical address bit as per radix table */
+#define _RPAGE_PA_MAX		57
 
+/*
+ * Max physical address bit we will use for now.
+ *
+ * This is mostly a hardware limitation and for now Power9 has
+ * a 51 bit limit.
+ *
+ * This is different from the number of physical bit required to address
+ * the last byte of memory. That is defined by MAX_PHYSMEM_BITS.
+ * MAX_PHYSMEM_BITS is a linux limitation imposed by the maximum
+ * number of sections we can support (SECTIONS_SHIFT).
+ *
+ * This is different from Radix page table limitation above and
+ * should always be less than that. The limit is done such that
+ * we can overload the bits between _RPAGE_PA_MAX and _PAGE_PA_MAX
+ * for hash linux page table specific bits.
+ *
+ * In order to be compatible with future hardware generations we keep
+ * some offsets and limit this for now to 53
+ */
+#define _PAGE_PA_MAX		53
 
-#define _PAGE_PTE		(1ul << 62)	/* distinguishes PTEs from pointers */
-#define _PAGE_PRESENT		(1ul << 63)	/* pte contains a translation */
+#define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
+#define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
 /*
  * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
  * Instead of fixing all of them, add an alternate define which
@@ -59,10 +86,11 @@
  */
 #define _PAGE_NO_CACHE		_PAGE_TOLERANT
 /*
- * We support 57 bit real address in pte. Clear everything above 57, and
- * every thing below PAGE_SHIFT;
+ * We support _RPAGE_PA_MAX bit real address in pte. On the linux side
+ * we are limited by _PAGE_PA_MAX. Clear everything above _PAGE_PA_MAX
+ * and every thing below PAGE_SHIFT;
  */
-#define PTE_RPN_MASK	(((1UL << 57) - 1) & (PAGE_MASK))
+#define PTE_RPN_MASK	(((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK))
 /*
  * set of bits not changed in pmd_modify. Even though we have hash specific bits
  * in here, on radix we expect them to be zero.
@@ -205,10 +233,6 @@ extern unsigned long __pte_frag_nr;
 extern unsigned long __pte_frag_size_shift;
 #define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift
 #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
-/*
- * Pgtable size used by swapper, init in asm code
- */
-#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
 
 #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
 #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 9e0bb7cd6e22..ac16d1943022 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -11,6 +11,12 @@
 #include <asm/book3s/64/radix-4k.h>
 #endif
 
+/*
+ * For P9 DD1 only, we need to track whether the pte's huge.
+ */
+#define R_PAGE_LARGE	_RPAGE_RSV1
+
+
 #ifndef __ASSEMBLY__
 #include <asm/book3s/64/tlbflush-radix.h>
 #include <asm/cpu_has_feature.h>
@@ -252,7 +258,7 @@ static inline int radix__pmd_trans_huge(pmd_t pmd)
 static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
 {
 	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
-		return __pmd(pmd_val(pmd) | _PAGE_PTE | _PAGE_LARGE);
+		return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
 	return __pmd(pmd_val(pmd) | _PAGE_PTE);
 }
 static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 8ab937771068..abef812de7f8 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -12,6 +12,8 @@
 
 #include <asm/types.h>
 #include <asm/ppc-opcode.h>
+#include <linux/string.h>
+#include <linux/kallsyms.h>
 
 /* Flags for create_branch:
  * "b"   == create_branch(addr, target, 0);
@@ -99,6 +101,45 @@ static inline unsigned long ppc_global_function_entry(void *func)
 #endif
 }
 
+/*
+ * Wrapper around kallsyms_lookup() to return function entry address:
+ * - For ABIv1, we lookup the dot variant.
+ * - For ABIv2, we return the local entry point.
+ */
+static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
+{
+	unsigned long addr;
+#ifdef PPC64_ELF_ABI_v1
+	/* check for dot variant */
+	char dot_name[1 + KSYM_NAME_LEN];
+	bool dot_appended = false;
+
+	if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
+		return 0;
+
+	if (name[0] != '.') {
+		dot_name[0] = '.';
+		dot_name[1] = '\0';
+		strlcat(dot_name, name, sizeof(dot_name));
+		dot_appended = true;
+	} else {
+		dot_name[0] = '\0';
+		strlcat(dot_name, name, sizeof(dot_name));
+	}
+	addr = kallsyms_lookup_name(dot_name);
+	if (!addr && dot_appended)
+		/* Let's try the original non-dot symbol lookup	*/
+		addr = kallsyms_lookup_name(name);
+#elif defined(PPC64_ELF_ABI_v2)
+	addr = kallsyms_lookup_name(name);
+	if (addr)
+		addr = ppc_function_entry((void *)addr);
+#else
+	addr = kallsyms_lookup_name(name);
+#endif
+	return addr;
+}
+
 #ifdef CONFIG_PPC64
 /*
  * Some instruction encodings commonly used in dynamic ftracing
diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 155731557c9b..52586f9956bb 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -2,13 +2,39 @@
 #define _ASM_POWERPC_CPUIDLE_H
 
 #ifdef CONFIG_PPC_POWERNV
-/* Used in powernv idle state management */
+/* Thread state used in powernv idle state management */
 #define PNV_THREAD_RUNNING              0
 #define PNV_THREAD_NAP                  1
 #define PNV_THREAD_SLEEP                2
 #define PNV_THREAD_WINKLE               3
-#define PNV_CORE_IDLE_LOCK_BIT          0x100
-#define PNV_CORE_IDLE_THREAD_BITS       0x0FF
+
+/*
+ * Core state used in powernv idle for POWER8.
+ *
+ * The lock bit synchronizes updates to the state, as well as parts of the
+ * sleep/wake code (see kernel/idle_book3s.S).
+ *
+ * Bottom 8 bits track the idle state of each thread. Bit is cleared before
+ * the thread executes an idle instruction (nap/sleep/winkle).
+ *
+ * Then there is winkle tracking. A core does not lose complete state
+ * until every thread is in winkle. So the winkle count field counts the
+ * number of threads in winkle (small window of false positives is okay
+ * around the sleep/wake, so long as there are no false negatives).
+ *
+ * When the winkle count reaches 8 (the COUNT_ALL_BIT becomes set), then
+ * the THREAD_WINKLE_BITS are set, which indicate which threads have not
+ * yet woken from the winkle state.
+ */
+#define PNV_CORE_IDLE_LOCK_BIT			0x10000000
+
+#define PNV_CORE_IDLE_WINKLE_COUNT		0x00010000
+#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT	0x00080000
+#define PNV_CORE_IDLE_WINKLE_COUNT_BITS		0x000F0000
+#define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT	8
+#define PNV_CORE_IDLE_THREAD_WINKLE_BITS	0x0000FF00
+
+#define PNV_CORE_IDLE_THREAD_BITS       	0x000000FF
 
 /*
  * ============================ NOTE =================================
@@ -46,6 +72,7 @@ extern u32 pnv_fastsleep_workaround_at_exit[];
 
 extern u64 pnv_first_deep_stop_state;
 
+unsigned long pnv_cpu_offline(unsigned int cpu);
 int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags);
 static inline void report_invalid_psscr_val(u64 psscr_val, int err)
 {
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index ab68d0ee7725..1f6847b107e4 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -471,10 +471,11 @@ enum {
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
 	    CPU_FTR_DSCR | CPU_FTR_SAO  | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
+	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
 	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
-#define CPU_FTRS_POWER9_DD1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1)
+#define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
+			     (~CPU_FTR_SAO))
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index 378167377065..f70cbfe0ec04 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -35,33 +35,53 @@ enum ppc_dbell {
 #ifdef CONFIG_PPC_BOOK3S
 
 #define PPC_DBELL_MSGTYPE		PPC_DBELL_SERVER
-#define SPRN_DOORBELL_CPUTAG		SPRN_TIR
-#define PPC_DBELL_TAG_MASK		0x7f
 
 static inline void _ppc_msgsnd(u32 msg)
 {
-	if (cpu_has_feature(CPU_FTR_HVMODE))
-		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
-	else
-		__asm__ __volatile__ (PPC_MSGSNDP(%0) : : "r" (msg));
+	__asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSND(%1), PPC_MSGSNDP(%1), %0)
+				: : "i" (CPU_FTR_HVMODE), "r" (msg));
+}
+
+/* sync before sending message */
+static inline void ppc_msgsnd_sync(void)
+{
+	__asm__ __volatile__ ("sync" : : : "memory");
+}
+
+/* sync after taking message interrupt */
+static inline void ppc_msgsync(void)
+{
+	/* sync is not required when taking messages from the same core */
+	__asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSYNC " ; lwsync", "", %0)
+				: : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300));
 }
 
 #else /* CONFIG_PPC_BOOK3S */
 
 #define PPC_DBELL_MSGTYPE		PPC_DBELL
-#define SPRN_DOORBELL_CPUTAG		SPRN_PIR
-#define PPC_DBELL_TAG_MASK		0x3fff
 
 static inline void _ppc_msgsnd(u32 msg)
 {
 	__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
 }
 
+/* sync before sending message */
+static inline void ppc_msgsnd_sync(void)
+{
+	__asm__ __volatile__ ("sync" : : : "memory");
+}
+
+/* sync after taking message interrupt */
+static inline void ppc_msgsync(void)
+{
+}
+
 #endif /* CONFIG_PPC_BOOK3S */
 
-extern void doorbell_cause_ipi(int cpu, unsigned long data);
+extern void doorbell_global_ipi(int cpu);
+extern void doorbell_core_ipi(int cpu);
+extern int doorbell_try_core_ipi(int cpu);
 extern void doorbell_exception(struct pt_regs *regs);
-extern void doorbell_setup_this_cpu(void);
 
 static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
 {
diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 86308f177f2d..5d5af3fddfd8 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -8,8 +8,6 @@
 
 struct pt_regs;
 
-extern struct dentry *powerpc_debugfs_root;
-
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
 
 extern int (*__debugger)(struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h
new file mode 100644
index 000000000000..4f3b39f3e3d2
--- /dev/null
+++ b/arch/powerpc/include/asm/debugfs.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_POWERPC_DEBUGFS_H
+#define _ASM_POWERPC_DEBUGFS_H
+
+/*
+ * Copyright 2017, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/debugfs.h>
+
+extern struct dentry *powerpc_debugfs_root;
+
+#endif /* _ASM_POWERPC_DEBUGFS_H */
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index ed3beadd2cc5..183d73b6ed99 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -167,17 +167,14 @@ BEGIN_FTR_SECTION_NESTED(943)						\
 	std	ra,offset(r13);						\
 END_FTR_SECTION_NESTED(ftr,ftr,943)
 
-#define EXCEPTION_PROLOG_0_PACA(area)					\
+#define EXCEPTION_PROLOG_0(area)					\
+	GET_PACA(r13);							\
 	std	r9,area+EX_R9(r13);	/* save r9 */			\
 	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);			\
 	HMT_MEDIUM;							\
 	std	r10,area+EX_R10(r13);	/* save r10 - r12 */		\
 	OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
 
-#define EXCEPTION_PROLOG_0(area)					\
-	GET_PACA(r13);							\
-	EXCEPTION_PROLOG_0_PACA(area)
-
 #define __EXCEPTION_PROLOG_1(area, extra, vec)				\
 	OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR);		\
 	OPT_SAVE_REG_TO_PACA(area+EX_CFAR, r10, CPU_FTR_CFAR);		\
@@ -203,17 +200,26 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define EXCEPTION_PROLOG_PSERIES_1(label, h)				\
 	__EXCEPTION_PROLOG_PSERIES_1(label, h)
 
+/* _NORI variant keeps MSR_RI clear */
+#define __EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\
+	ld	r10,PACAKMSR(r13);	/* get MSR value for kernel */	\
+	xori	r10,r10,MSR_RI;		/* Clear MSR_RI */		\
+	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\
+	LOAD_HANDLER(r12,label)						\
+	mtspr	SPRN_##h##SRR0,r12;					\
+	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
+	mtspr	SPRN_##h##SRR1,r10;					\
+	h##rfid;							\
+	b	.	/* prevent speculative execution */
+
+#define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\
+	__EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)
+
 #define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec)		\
 	EXCEPTION_PROLOG_0(area);					\
 	EXCEPTION_PROLOG_1(area, extra, vec);				\
 	EXCEPTION_PROLOG_PSERIES_1(label, h);
 
-/* Have the PACA in r13 already */
-#define EXCEPTION_PROLOG_PSERIES_PACA(area, label, h, extra, vec)	\
-	EXCEPTION_PROLOG_0_PACA(area);					\
-	EXCEPTION_PROLOG_1(area, extra, vec);				\
-	EXCEPTION_PROLOG_PSERIES_1(label, h);
-
 #define __KVMTEST(h, n)							\
 	lbz	r10,HSTATE_IN_GUEST(r13);				\
 	cmpwi	r10,0;							\
@@ -256,11 +262,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	ld	r9,area+EX_R9(r13);					\
 	bctr
 
-#define BRANCH_TO_KVM(reg, label)					\
-	__LOAD_FAR_HANDLER(reg, label);					\
-	mtctr	reg;							\
-	bctr
-
 #else
 #define BRANCH_TO_COMMON(reg, label)					\
 	b	label
@@ -268,15 +269,18 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define BRANCH_LINK_TO_FAR(label)					\
 	bl	label
 
-#define BRANCH_TO_KVM(reg, label)					\
-	b	label
-
 #define __BRANCH_TO_KVM_EXIT(area, label)				\
 	ld	r9,area+EX_R9(r13);					\
 	b	label
 
 #endif
 
+/* Do not enable RI */
+#define EXCEPTION_PROLOG_PSERIES_NORI(area, label, h, extra, vec)	\
+	EXCEPTION_PROLOG_0(area);					\
+	EXCEPTION_PROLOG_1(area, extra, vec);				\
+	EXCEPTION_PROLOG_PSERIES_1_NORI(label, h);
+
 
 #define __KVM_HANDLER(area, h, n)					\
 	BEGIN_FTR_SECTION_NESTED(947)					\
@@ -325,6 +329,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define NOTEST(n)
 
+#define EXCEPTION_PROLOG_COMMON_1()					   \
+	std	r9,_CCR(r1);		/* save CR in stackframe	*/ \
+	std	r11,_NIP(r1);		/* save SRR0 in stackframe	*/ \
+	std	r12,_MSR(r1);		/* save SRR1 in stackframe	*/ \
+	std	r10,0(r1);		/* make stack chain pointer	*/ \
+	std	r0,GPR0(r1);		/* save r0 in stackframe	*/ \
+	std	r10,GPR1(r1);		/* save r1 in stackframe	*/ \
+
+
 /*
  * The common exception prolog is used for all except a few exceptions
  * such as a segment miss on a kernel address.  We have to be prepared
@@ -349,12 +362,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	addi	r3,r13,area;		/* r3 -> where regs are saved*/	   \
 	RESTORE_CTR(r1, area);						   \
 	b	bad_stack;						   \
-3:	std	r9,_CCR(r1);		/* save CR in stackframe	*/ \
-	std	r11,_NIP(r1);		/* save SRR0 in stackframe	*/ \
-	std	r12,_MSR(r1);		/* save SRR1 in stackframe	*/ \
-	std	r10,0(r1);		/* make stack chain pointer	*/ \
-	std	r0,GPR0(r1);		/* save r0 in stackframe	*/ \
-	std	r10,GPR1(r1);		/* save r1 in stackframe	*/ \
+3:	EXCEPTION_PROLOG_COMMON_1();					   \
 	beq	4f;			/* if from kernel mode		*/ \
 	ACCOUNT_CPU_USER_ENTRY(r13, r9, r10);				   \
 	SAVE_PPR(area, r9, r10);					   \
@@ -522,7 +530,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label)			\
 	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec);		\
-	EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
+	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)
 
 /*
  * Our exception common code can be passed various "additions"
@@ -547,26 +555,39 @@ BEGIN_FTR_SECTION				\
 	beql	ppc64_runlatch_on_trampoline;	\
 END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
 
-#define EXCEPTION_COMMON(trap, label, hdlr, ret, additions)	\
-	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);		\
+#define EXCEPTION_COMMON(area, trap, label, hdlr, ret, additions) \
+	EXCEPTION_PROLOG_COMMON(trap, area);			\
 	/* Volatile regs are potentially clobbered here */	\
 	additions;						\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
 	bl	hdlr;						\
 	b	ret
 
+/*
+ * Exception where stack is already set in r1, r1 is saved in r10, and it
+ * continues rather than returns.
+ */
+#define EXCEPTION_COMMON_NORET_STACK(area, trap, label, hdlr, additions) \
+	EXCEPTION_PROLOG_COMMON_1();				\
+	EXCEPTION_PROLOG_COMMON_2(area);			\
+	EXCEPTION_PROLOG_COMMON_3(trap);			\
+	/* Volatile regs are potentially clobbered here */	\
+	additions;						\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
+	bl	hdlr
+
 #define STD_EXCEPTION_COMMON(trap, label, hdlr)			\
-	EXCEPTION_COMMON(trap, label, hdlr, ret_from_except,	\
-			 ADD_NVGPRS;ADD_RECONCILE)
+	EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr,		\
+		ret_from_except, ADD_NVGPRS;ADD_RECONCILE)
 
 /*
  * Like STD_EXCEPTION_COMMON, but for exceptions that can occur
  * in the idle task and therefore need the special idle handling
  * (finish nap and runlatch)
  */
-#define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr)		  \
-	EXCEPTION_COMMON(trap, label, hdlr, ret_from_except_lite, \
-			 FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON)
+#define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr)		\
+	EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr,		\
+		ret_from_except_lite, FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON)
 
 /*
  * When the idle code in power4_idle puts the CPU into NAP mode,
diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index ddf54f5bbdd1..2de2319b99e2 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -66,6 +66,9 @@ label##5:							\
 #define END_FTR_SECTION(msk, val)		\
 	END_FTR_SECTION_NESTED(msk, val, 97)
 
+#define END_FTR_SECTION_NESTED_IFSET(msk, label)	\
+	END_FTR_SECTION_NESTED((msk), (msk), label)
+
 #define END_FTR_SECTION_IFSET(msk)	END_FTR_SECTION((msk), (msk))
 #define END_FTR_SECTION_IFCLR(msk)	END_FTR_SECTION((msk), 0)
 
diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index 5067048daad4..86eb87382031 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -213,6 +213,7 @@ name:
 	USE_TEXT_SECTION();					\
 	.balign IFETCH_ALIGN_BYTES;				\
 	.global name;						\
+	_ASM_NOKPROBE_SYMBOL(name);				\
 	DEFINE_FIXED_SYMBOL(name);				\
 name:
 
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 3cc12a86ef5d..d73755fafbb0 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -377,16 +377,6 @@ long plpar_hcall_raw(unsigned long opcode, unsigned long *retbuf, ...);
 long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...);
 long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...);
 
-/* For hcall instrumentation.  One structure per-hcall, per-CPU */
-struct hcall_stats {
-	unsigned long	num_calls;	/* number of calls (on this CPU) */
-	unsigned long	tb_total;	/* total wall time (mftb) of calls. */
-	unsigned long	purr_total;	/* total cpu time (PURR) of calls. */
-	unsigned long	tb_start;
-	unsigned long	purr_start;
-};
-#define HCALL_STAT_ARRAY_SIZE	((MAX_HCALL_OPCODE >> 2) + 1)
-
 struct hvcall_mpp_data {
 	unsigned long entitled_mem;
 	unsigned long mapped_mem;
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 5ed292431b5b..422f99cf9924 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -25,8 +25,6 @@ extern struct pci_dev *isa_bridge_pcidev;
 #endif
 
 #include <linux/device.h>
-#include <linux/io.h>
-
 #include <linux/compiler.h>
 #include <asm/page.h>
 #include <asm/byteorder.h>
@@ -192,24 +190,8 @@ DEF_MMIO_OUT_D(out_le32, 32, stw);
 
 #endif /* __BIG_ENDIAN */
 
-/*
- * Cache inhibitied accessors for use in real mode, you don't want to use these
- * unless you know what you're doing.
- *
- * NB. These use the cpu byte ordering.
- */
-DEF_MMIO_OUT_X(out_rm8,   8, stbcix);
-DEF_MMIO_OUT_X(out_rm16, 16, sthcix);
-DEF_MMIO_OUT_X(out_rm32, 32, stwcix);
-DEF_MMIO_IN_X(in_rm8,   8, lbzcix);
-DEF_MMIO_IN_X(in_rm16, 16, lhzcix);
-DEF_MMIO_IN_X(in_rm32, 32, lwzcix);
-
 #ifdef __powerpc64__
 
-DEF_MMIO_OUT_X(out_rm64, 64, stdcix);
-DEF_MMIO_IN_X(in_rm64, 64, ldcix);
-
 #ifdef __BIG_ENDIAN__
 DEF_MMIO_OUT_D(out_be64, 64, std);
 DEF_MMIO_IN_D(in_be64, 64, ld);
@@ -242,35 +224,6 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
 #endif
 #endif /* __powerpc64__ */
 
-
-/*
- * Simple Cache inhibited accessors
- * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
- * barriers, callers need to manage memory barriers on their own.
- * These can only be used in hypervisor real mode.
- */
-
-static inline u32 _lwzcix(unsigned long addr)
-{
-	u32 ret;
-
-	__asm__ __volatile__("lwzcix %0,0, %1"
-			     : "=r" (ret) : "r" (addr) : "memory");
-	return ret;
-}
-
-static inline void _stbcix(u64 addr, u8 val)
-{
-	__asm__ __volatile__("stbcix %0,0,%1"
-		: : "r" (val), "r" (addr) : "memory");
-}
-
-static inline void _stwcix(u64 addr, u32 val)
-{
-	__asm__ __volatile__("stwcix %0,0,%1"
-		: : "r" (val), "r" (addr) : "memory");
-}
-
 /*
  * Low level IO stream instructions are defined out of line for now
  */
@@ -417,15 +370,64 @@ static inline void __raw_writeq(unsigned long v, volatile void __iomem *addr)
 }
 
 /*
- * Real mode version of the above. stdcix is only supposed to be used
- * in hypervisor real mode as per the architecture spec.
+ * Real mode versions of the above. Those instructions are only supposed
+ * to be used in hypervisor real mode as per the architecture spec.
  */
+static inline void __raw_rm_writeb(u8 val, volatile void __iomem *paddr)
+{
+	__asm__ __volatile__("stbcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+static inline void __raw_rm_writew(u16 val, volatile void __iomem *paddr)
+{
+	__asm__ __volatile__("sthcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+static inline void __raw_rm_writel(u32 val, volatile void __iomem *paddr)
+{
+	__asm__ __volatile__("stwcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
 static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
 {
 	__asm__ __volatile__("stdcix %0,0,%1"
 		: : "r" (val), "r" (paddr) : "memory");
 }
 
+static inline u8 __raw_rm_readb(volatile void __iomem *paddr)
+{
+	u8 ret;
+	__asm__ __volatile__("lbzcix %0,0, %1"
+			     : "=r" (ret) : "r" (paddr) : "memory");
+	return ret;
+}
+
+static inline u16 __raw_rm_readw(volatile void __iomem *paddr)
+{
+	u16 ret;
+	__asm__ __volatile__("lhzcix %0,0, %1"
+			     : "=r" (ret) : "r" (paddr) : "memory");
+	return ret;
+}
+
+static inline u32 __raw_rm_readl(volatile void __iomem *paddr)
+{
+	u32 ret;
+	__asm__ __volatile__("lwzcix %0,0, %1"
+			     : "=r" (ret) : "r" (paddr) : "memory");
+	return ret;
+}
+
+static inline u64 __raw_rm_readq(volatile void __iomem *paddr)
+{
+	u64 ret;
+	__asm__ __volatile__("ldcix %0,0, %1"
+			     : "=r" (ret) : "r" (paddr) : "memory");
+	return ret;
+}
 #endif /* __powerpc64__ */
 
 /*
@@ -757,6 +759,8 @@ extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
 extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size);
 #define ioremap_nocache(addr, size)	ioremap((addr), (size))
 #define ioremap_uc(addr, size)		ioremap((addr), (size))
+#define ioremap_cache(addr, size) \
+	ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))
 
 extern void iounmap(volatile void __iomem *addr);
 
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2c1d50792944..d96142572e6d 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -64,6 +64,11 @@ struct iommu_table_ops {
 			long index,
 			unsigned long *hpa,
 			enum dma_data_direction *direction);
+	/* Real mode */
+	int (*exchange_rm)(struct iommu_table *tbl,
+			long index,
+			unsigned long *hpa,
+			enum dma_data_direction *direction);
 #endif
 	void (*clear)(struct iommu_table *tbl,
 			long index, long npages);
@@ -114,6 +119,7 @@ struct iommu_table {
 	struct list_head it_group_list;/* List of iommu_table_group_link */
 	unsigned long *it_userspace; /* userspace view of the table */
 	struct iommu_table_ops *it_ops;
+	struct kref    it_kref;
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
@@ -146,8 +152,8 @@ static inline void *get_iommu_table_base(struct device *dev)
 
 extern int dma_iommu_dma_supported(struct device *dev, u64 mask);
 
-/* Frees table for an individual device node */
-extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
+extern struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl);
+extern int iommu_tce_table_put(struct iommu_table *tbl);
 
 /* Initializes an iommu_table based in values set in the passed-in
  * structure
@@ -208,6 +214,8 @@ extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
 		unsigned long *hpa, enum dma_data_direction *direction);
+extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 0503c98b2117..a83821f33ea3 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -61,59 +61,6 @@ extern kprobe_opcode_t optprobe_template_end[];
 #define MAX_OPTINSN_SIZE	(optprobe_template_end - optprobe_template_entry)
 #define RELATIVEJUMP_SIZE	sizeof(kprobe_opcode_t)	/* 4 bytes */
 
-#ifdef PPC64_ELF_ABI_v2
-/* PPC64 ABIv2 needs local entry point */
-#define kprobe_lookup_name(name, addr)					\
-{									\
-	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);		\
-	if (addr)							\
-		addr = (kprobe_opcode_t *)ppc_function_entry(addr);	\
-}
-#elif defined(PPC64_ELF_ABI_v1)
-/*
- * 64bit powerpc ABIv1 uses function descriptors:
- * - Check for the dot variant of the symbol first.
- * - If that fails, try looking up the symbol provided.
- *
- * This ensures we always get to the actual symbol and not the descriptor.
- * Also handle <module:symbol> format.
- */
-#define kprobe_lookup_name(name, addr)					\
-{									\
-	char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];		\
-	const char *modsym;							\
-	bool dot_appended = false;					\
-	if ((modsym = strchr(name, ':')) != NULL) {			\
-		modsym++;						\
-		if (*modsym != '\0' && *modsym != '.') {		\
-			/* Convert to <module:.symbol> */		\
-			strncpy(dot_name, name, modsym - name);		\
-			dot_name[modsym - name] = '.';			\
-			dot_name[modsym - name + 1] = '\0';		\
-			strncat(dot_name, modsym,			\
-				sizeof(dot_name) - (modsym - name) - 2);\
-			dot_appended = true;				\
-		} else {						\
-			dot_name[0] = '\0';				\
-			strncat(dot_name, name, sizeof(dot_name) - 1);	\
-		}							\
-	} else if (name[0] != '.') {					\
-		dot_name[0] = '.';					\
-		dot_name[1] = '\0';					\
-		strncat(dot_name, name, KSYM_NAME_LEN - 2);		\
-		dot_appended = true;					\
-	} else {							\
-		dot_name[0] = '\0';					\
-		strncat(dot_name, name, KSYM_NAME_LEN - 1);		\
-	}								\
-	addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);	\
-	if (!addr && dot_appended) {					\
-		/* Let's try the original non-dot symbol lookup	*/	\
-		addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);	\
-	}								\
-}
-#endif
-
 #define flush_insn_slot(p)	do { } while (0)
 #define kretprobe_blacklist_size 0
 
@@ -156,6 +103,16 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 extern int kprobe_handler(struct pt_regs *regs);
 extern int kprobe_post_handler(struct pt_regs *regs);
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+			   struct kprobe_ctlblk *kcb);
+#else
+static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+				  struct kprobe_ctlblk *kcb)
+{
+	return 0;
+}
+#endif
 #else
 static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
 static inline int kprobe_post_handler(struct pt_regs *regs) { return 0; }
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d9b48f5bb606..d55c7f881ce7 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -49,8 +49,6 @@ static inline bool kvm_is_radix(struct kvm *kvm)
 #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
 #endif
 
-#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
-
 /*
  * We use a lock bit in HPTE dword 0 to synchronize updates and
  * accesses to each HPTE, and another bit to indicate non-present
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d318d432caa9..0593d9479f74 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -110,7 +110,7 @@ struct kvmppc_host_state {
 	u8 ptid;
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	u32 saved_xirr;
 	u64 dabr;
 	u64 host_mmcr[7];	/* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index dd11c4c8c56a..c3877992eff9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -409,7 +409,7 @@ struct openpic;
 extern void kvm_cma_reserve(void) __init;
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
-	paca[cpu].kvm_hstate.xics_phys = addr;
+	paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr;
 }
 
 static inline u32 kvmppc_get_xics_latch(void)
@@ -478,8 +478,6 @@ extern void kvmppc_free_host_rm_ops(void);
 extern void kvmppc_free_pimap(struct kvm *kvm);
 extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
-extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
-extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
 extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
 extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
@@ -507,12 +505,6 @@ static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 	{ return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
-static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
-					 unsigned long server)
-	{ return -EINVAL; }
-static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
-					struct kvm_irq_level *args)
-	{ return -ENOTTY; }
 static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 	{ return 0; }
 #endif
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index ed62efe01e49..81eff8631434 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -24,97 +24,6 @@
 
 #include <linux/bitops.h>
 
-/*
- * Machine Check bits on power7 and power8
- */
-#define P7_SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42)) /* P8 too */
-
-/* SRR1 bits for machine check (On Power7 and Power8) */
-#define P7_SRR1_MC_IFETCH(srr1)	((srr1) & PPC_BITMASK(43, 45)) /* P8 too */
-
-#define P7_SRR1_MC_IFETCH_UE		(0x1 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_PARITY	(0x2 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT	(0x3 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_BOTH	(0x4 << PPC_BITLSHIFT(45))
-#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT	(0x5 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD	(0x6 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL	(0x7 << PPC_BITLSHIFT(45))
-
-/* SRR1 bits for machine check (On Power8) */
-#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT	(0x4 << PPC_BITLSHIFT(45))
-
-/* DSISR bits for machine check (On Power7 and Power8) */
-#define P7_DSISR_MC_UE			(PPC_BIT(48))	/* P8 too */
-#define P7_DSISR_MC_UE_TABLEWALK	(PPC_BIT(49))	/* P8 too */
-#define P7_DSISR_MC_ERAT_MULTIHIT	(PPC_BIT(52))	/* P8 too */
-#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB	(PPC_BIT(53))	/* P8 too */
-#define P7_DSISR_MC_SLB_PARITY_MFSLB	(PPC_BIT(55))	/* P8 too */
-#define P7_DSISR_MC_SLB_MULTIHIT	(PPC_BIT(56))	/* P8 too */
-#define P7_DSISR_MC_SLB_MULTIHIT_PARITY	(PPC_BIT(57))	/* P8 too */
-
-/*
- * DSISR bits for machine check (Power8) in addition to above.
- * Secondary DERAT Multihit
- */
-#define P8_DSISR_MC_ERAT_MULTIHIT_SEC	(PPC_BIT(54))
-
-/* SLB error bits */
-#define P7_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_ERAT_MULTIHIT | \
-					 P7_DSISR_MC_SLB_PARITY_MFSLB | \
-					 P7_DSISR_MC_SLB_MULTIHIT | \
-					 P7_DSISR_MC_SLB_MULTIHIT_PARITY)
-
-#define P8_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_SLB_ERRORS | \
-					 P8_DSISR_MC_ERAT_MULTIHIT_SEC)
-
-/*
- * Machine Check bits on power9
- */
-#define P9_SRR1_MC_LOADSTORE(srr1)	(((srr1) >> PPC_BITLSHIFT(42)) & 1)
-
-#define P9_SRR1_MC_IFETCH(srr1)	(	\
-	PPC_BITEXTRACT(srr1, 45, 0) |	\
-	PPC_BITEXTRACT(srr1, 44, 1) |	\
-	PPC_BITEXTRACT(srr1, 43, 2) |	\
-	PPC_BITEXTRACT(srr1, 36, 3) )
-
-/* 0 is reserved */
-#define P9_SRR1_MC_IFETCH_UE				1
-#define P9_SRR1_MC_IFETCH_SLB_PARITY			2
-#define P9_SRR1_MC_IFETCH_SLB_MULTIHIT			3
-#define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT			4
-#define P9_SRR1_MC_IFETCH_TLB_MULTIHIT			5
-#define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD			6
-/* 7 is reserved */
-#define P9_SRR1_MC_IFETCH_LINK_TIMEOUT			8
-#define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT	9
-/* 10 ? */
-#define P9_SRR1_MC_IFETCH_RA			11
-#define P9_SRR1_MC_IFETCH_RA_TABLEWALK		12
-#define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE		13
-#define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT	14
-#define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN	15
-
-/* DSISR bits for machine check (On Power9) */
-#define P9_DSISR_MC_UE					(PPC_BIT(48))
-#define P9_DSISR_MC_UE_TABLEWALK			(PPC_BIT(49))
-#define P9_DSISR_MC_LINK_LOAD_TIMEOUT			(PPC_BIT(50))
-#define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT		(PPC_BIT(51))
-#define P9_DSISR_MC_ERAT_MULTIHIT			(PPC_BIT(52))
-#define P9_DSISR_MC_TLB_MULTIHIT_MFTLB			(PPC_BIT(53))
-#define P9_DSISR_MC_USER_TLBIE				(PPC_BIT(54))
-#define P9_DSISR_MC_SLB_PARITY_MFSLB			(PPC_BIT(55))
-#define P9_DSISR_MC_SLB_MULTIHIT_MFSLB			(PPC_BIT(56))
-#define P9_DSISR_MC_RA_LOAD				(PPC_BIT(57))
-#define P9_DSISR_MC_RA_TABLEWALK			(PPC_BIT(58))
-#define P9_DSISR_MC_RA_TABLEWALK_FOREIGN		(PPC_BIT(59))
-#define P9_DSISR_MC_RA_FOREIGN				(PPC_BIT(60))
-
-/* SLB error bits */
-#define P9_DSISR_MC_SLB_ERRORS		(P9_DSISR_MC_ERAT_MULTIHIT | \
-					 P9_DSISR_MC_SLB_PARITY_MFSLB | \
-					 P9_DSISR_MC_SLB_MULTIHIT_MFSLB)
-
 enum MCE_Version {
 	MCE_V1 = 1,
 };
@@ -298,7 +207,8 @@ extern void save_mce_event(struct pt_regs *regs, long handled,
 extern int get_mce_event(struct machine_check_event *mce, bool release);
 extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
-extern void machine_check_print_event_info(struct machine_check_event *evt);
+extern void machine_check_print_event_info(struct machine_check_event *evt,
+					   bool user_mode);
 extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
 
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index b62a8d43a06c..7ca8d8e80ffa 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -229,11 +229,6 @@ typedef struct {
 	unsigned int	id;
 	unsigned int	active;
 	unsigned long	vdso_base;
-#ifdef CONFIG_PPC_MM_SLICES
-	u64 low_slices_psize;   /* SLB page size encodings */
-	u64 high_slices_psize;  /* 4 bits per slice for now */
-	u16 user_psize;         /* page size index */
-#endif
 #ifdef CONFIG_PPC_64K_PAGES
 	/* for 4K PTE fragment support */
 	void *pte_frag;
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 065e762fae85..78260409dc9c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -29,6 +29,10 @@
  */
 
 /*
+ * Support for 68 bit VA space. We added that from ISA 2.05
+ */
+#define MMU_FTR_68_BIT_VA		ASM_CONST(0x00002000)
+/*
  * Kernel read only support.
  * We added the ppp value 0b110 in ISA 2.04.
  */
@@ -109,10 +113,10 @@
 #define MMU_FTRS_POWER4		MMU_FTRS_DEFAULT_HPTE_ARCH_V2
 #define MMU_FTRS_PPC970		MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA
 #define MMU_FTRS_POWER5		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER6		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
-#define MMU_FTRS_POWER7		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
-#define MMU_FTRS_POWER8		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
-#define MMU_FTRS_POWER9		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
+#define MMU_FTRS_POWER6		MMU_FTRS_POWER5 | MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA
+#define MMU_FTRS_POWER7		MMU_FTRS_POWER6
+#define MMU_FTRS_POWER8		MMU_FTRS_POWER6
+#define MMU_FTRS_POWER9		MMU_FTRS_POWER6
 #define MMU_FTRS_CELL		MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
 				MMU_FTR_CI_LARGE_PAGE
 #define MMU_FTRS_PA6T		MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
@@ -136,7 +140,7 @@ enum {
 		MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL |
 		MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE |
 		MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA |
-		MMU_FTR_KERNEL_RO |
+		MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA |
 #ifdef CONFIG_PPC_RADIX_MMU
 		MMU_FTR_TYPE_RADIX |
 #endif
@@ -290,7 +294,10 @@ static inline bool early_radix_enabled(void)
 #define MMU_PAGE_16G	14
 #define MMU_PAGE_64G	15
 
-/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */
+/*
+ * N.B. we need to change the type of hpte_page_sizes if this gets to be > 16
+ * Also we need to change he type of mm_context.low/high_slices_psize.
+ */
 #define MMU_PAGE_COUNT	16
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index ecf9885ab660..da7e9432fa8f 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -29,10 +29,14 @@ extern void mm_iommu_init(struct mm_struct *mm);
 extern void mm_iommu_cleanup(struct mm_struct *mm);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
 		unsigned long ua, unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
+		struct mm_struct *mm, unsigned long ua, unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 		unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
 #endif
@@ -51,7 +55,8 @@ static inline void switch_mmu_context(struct mm_struct *prev,
 	return switch_slb(tsk, next);
 }
 
-extern int __init_new_context(void);
+extern int hash__alloc_context_id(void);
+extern void hash__reserve_context_id(int id);
 extern void __destroy_context(int context_id);
 static inline void mmu_context_init(void) { }
 #else
@@ -70,8 +75,9 @@ extern void drop_cop(unsigned long acop, struct mm_struct *mm);
  * switch_mm is the entry point called from the architecture independent
  * code in kernel/sched/core.c
  */
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
-			     struct task_struct *tsk)
+static inline void switch_mm_irqs_off(struct mm_struct *prev,
+				      struct mm_struct *next,
+				      struct task_struct *tsk)
 {
 	/* Mark this context has been used on the new CPU */
 	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next)))
@@ -110,6 +116,18 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	switch_mmu_context(prev, next, tsk);
 }
 
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+			     struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	switch_mm_irqs_off(prev, next, tsk);
+	local_irq_restore(flags);
+}
+#define switch_mm_irqs_off switch_mm_irqs_off
+
+
 #define deactivate_mm(tsk,mm)	do { } while (0)
 
 /*
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index c7f927e67d14..f0ff384d4ca5 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -88,11 +88,6 @@
 #include <asm/nohash/pte-book3e.h>
 #include <asm/pte-common.h>
 
-#ifdef CONFIG_PPC_MM_SLICES
-#define HAVE_ARCH_UNMAPPED_AREA
-#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#endif /* CONFIG_PPC_MM_SLICES */
-
 #ifndef __ASSEMBLY__
 /* pte_clear moved to later in this file */
 
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index a0aa285869b5..cb3e6242a78c 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -40,6 +40,8 @@
 #define OPAL_I2C_ARBT_LOST	-22
 #define OPAL_I2C_NACK_RCVD	-23
 #define OPAL_I2C_STOP_ERR	-24
+#define OPAL_XIVE_PROVISIONING	-31
+#define OPAL_XIVE_FREE_ACTIVE	-32
 
 /* API Tokens (in r0) */
 #define OPAL_INVALID_CALL		       -1
@@ -168,7 +170,27 @@
 #define OPAL_INT_SET_MFRR			125
 #define OPAL_PCI_TCE_KILL			126
 #define OPAL_NMMU_SET_PTCR			127
-#define OPAL_LAST				127
+#define OPAL_XIVE_RESET				128
+#define OPAL_XIVE_GET_IRQ_INFO			129
+#define OPAL_XIVE_GET_IRQ_CONFIG		130
+#define OPAL_XIVE_SET_IRQ_CONFIG		131
+#define OPAL_XIVE_GET_QUEUE_INFO		132
+#define OPAL_XIVE_SET_QUEUE_INFO		133
+#define OPAL_XIVE_DONATE_PAGE			134
+#define OPAL_XIVE_ALLOCATE_VP_BLOCK		135
+#define OPAL_XIVE_FREE_VP_BLOCK			136
+#define OPAL_XIVE_GET_VP_INFO			137
+#define OPAL_XIVE_SET_VP_INFO			138
+#define OPAL_XIVE_ALLOCATE_IRQ			139
+#define OPAL_XIVE_FREE_IRQ			140
+#define OPAL_XIVE_SYNC				141
+#define OPAL_XIVE_DUMP				142
+#define OPAL_XIVE_RESERVED3			143
+#define OPAL_XIVE_RESERVED4			144
+#define OPAL_NPU_INIT_CONTEXT			146
+#define OPAL_NPU_DESTROY_CONTEXT		147
+#define OPAL_NPU_MAP_LPAR			148
+#define OPAL_LAST				148
 
 /* Device tree flags */
 
@@ -928,6 +950,59 @@ enum {
 	OPAL_PCI_TCE_KILL_ALL,
 };
 
+/* The xive operation mode indicates the active "API" and
+ * corresponds to the "mode" parameter of the opal_xive_reset()
+ * call
+ */
+enum {
+	OPAL_XIVE_MODE_EMU	= 0,
+	OPAL_XIVE_MODE_EXPL	= 1,
+};
+
+/* Flags for OPAL_XIVE_GET_IRQ_INFO */
+enum {
+	OPAL_XIVE_IRQ_TRIGGER_PAGE	= 0x00000001,
+	OPAL_XIVE_IRQ_STORE_EOI		= 0x00000002,
+	OPAL_XIVE_IRQ_LSI		= 0x00000004,
+	OPAL_XIVE_IRQ_SHIFT_BUG		= 0x00000008,
+	OPAL_XIVE_IRQ_MASK_VIA_FW	= 0x00000010,
+	OPAL_XIVE_IRQ_EOI_VIA_FW	= 0x00000020,
+};
+
+/* Flags for OPAL_XIVE_GET/SET_QUEUE_INFO */
+enum {
+	OPAL_XIVE_EQ_ENABLED		= 0x00000001,
+	OPAL_XIVE_EQ_ALWAYS_NOTIFY	= 0x00000002,
+	OPAL_XIVE_EQ_ESCALATE		= 0x00000004,
+};
+
+/* Flags for OPAL_XIVE_GET/SET_VP_INFO */
+enum {
+	OPAL_XIVE_VP_ENABLED		= 0x00000001,
+};
+
+/* "Any chip" replacement for chip ID for allocation functions */
+enum {
+	OPAL_XIVE_ANY_CHIP		= 0xffffffff,
+};
+
+/* Xive sync options */
+enum {
+	/* This bits are cumulative, arg is a girq */
+	XIVE_SYNC_EAS			= 0x00000001, /* Sync irq source */
+	XIVE_SYNC_QUEUE			= 0x00000002, /* Sync irq target */
+};
+
+/* Dump options */
+enum {
+	XIVE_DUMP_TM_HYP	= 0,
+	XIVE_DUMP_TM_POOL	= 1,
+	XIVE_DUMP_TM_OS		= 2,
+	XIVE_DUMP_TM_USER	= 3,
+	XIVE_DUMP_VP		= 4,
+	XIVE_DUMP_EMU_STATE	= 5,
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1ff03a6da76e..588fb1c23af9 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -29,6 +29,11 @@ extern struct device_node *opal_node;
 
 /* API functions */
 int64_t opal_invalid_call(void);
+int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf);
+int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr,
+			uint64_t bdf);
+int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
+			uint64_t lpcr);
 int64_t opal_console_write(int64_t term_number, __be64 *length,
 			   const uint8_t *buffer);
 int64_t opal_console_read(int64_t term_number, __be64 *length,
@@ -226,6 +231,42 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
 			  uint32_t pe_num, uint32_t tce_size,
 			  uint64_t dma_addr, uint32_t npages);
 int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);
+int64_t opal_xive_reset(uint64_t version);
+int64_t opal_xive_get_irq_info(uint32_t girq,
+			       __be64 *out_flags,
+			       __be64 *out_eoi_page,
+			       __be64 *out_trig_page,
+			       __be32 *out_esb_shift,
+			       __be32 *out_src_chip);
+int64_t opal_xive_get_irq_config(uint32_t girq, __be64 *out_vp,
+				 uint8_t *out_prio, __be32 *out_lirq);
+int64_t opal_xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio,
+				 uint32_t lirq);
+int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio,
+				 __be64 *out_qpage,
+				 __be64 *out_qsize,
+				 __be64 *out_qeoi_page,
+				 __be32 *out_escalate_irq,
+				 __be64 *out_qflags);
+int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
+				 uint64_t qpage,
+				 uint64_t qsize,
+				 uint64_t qflags);
+int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr);
+int64_t opal_xive_alloc_vp_block(uint32_t alloc_order);
+int64_t opal_xive_free_vp_block(uint64_t vp);
+int64_t opal_xive_get_vp_info(uint64_t vp,
+			      __be64 *out_flags,
+			      __be64 *out_cam_value,
+			      __be64 *out_report_cl_pair,
+			      __be32 *out_chip_id);
+int64_t opal_xive_set_vp_info(uint64_t vp,
+			      uint64_t flags,
+			      uint64_t report_cl_pair);
+int64_t opal_xive_allocate_irq(uint32_t chip_id);
+int64_t opal_xive_free_irq(uint32_t girq);
+int64_t opal_xive_sync(uint32_t type, uint32_t id);
+int64_t opal_xive_dump(uint32_t type, uint32_t id);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 708c3e592eeb..1c09f8fe2ee8 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,7 +99,6 @@ struct paca_struct {
 	 */
 	/* used for most interrupts/exceptions */
 	u64 exgen[13] __attribute__((aligned(0x80)));
-	u64 exmc[13];		/* used for machine checks */
 	u64 exslb[13];		/* used for SLB/segment table misses
  				 * on the linear mapping */
 	/* SLB related definitions */
@@ -139,6 +138,7 @@ struct paca_struct {
 #ifdef CONFIG_PPC_MM_SLICES
 	u64 mm_ctx_low_slices_psize;
 	unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE];
+	unsigned long addr_limit;
 #else
 	u16 mm_ctx_user_psize;
 	u16 mm_ctx_sllp;
@@ -172,17 +172,31 @@ struct paca_struct {
 	u8 thread_mask;
 	/* Mask to denote subcore sibling threads */
 	u8 subcore_sibling_mask;
+	/*
+	 * Pointer to an array which contains pointer
+	 * to the sibling threads' paca.
+	 */
+	struct paca_struct **thread_sibling_pacas;
 #endif
 
+#ifdef CONFIG_PPC_STD_MMU_64
+	/* Non-maskable exceptions that are not performance critical */
+	u64 exnmi[13];		/* used for system reset (nmi) */
+	u64 exmc[13];		/* used for machine checks */
+#endif
 #ifdef CONFIG_PPC_BOOK3S_64
-	/* Exclusive emergency stack pointer for machine check exception. */
+	/* Exclusive stacks for system reset and machine check exception. */
+	void *nmi_emergency_sp;
 	void *mc_emergency_sp;
+
+	u16 in_nmi;			/* In nmi handler */
+
 	/*
 	 * Flag to check whether we are in machine check early handler
 	 * and already using emergency stack.
 	 */
 	u16 in_mce;
-	u8 hmi_event_available;		 /* HMI event is available */
+	u8 hmi_event_available;		/* HMI event is available */
 #endif
 
 	/* Stuff for accurate time accounting */
@@ -206,23 +220,7 @@ struct paca_struct {
 #endif
 };
 
-#ifdef CONFIG_PPC_BOOK3S
-static inline void copy_mm_to_paca(mm_context_t *context)
-{
-	get_paca()->mm_ctx_id = context->id;
-#ifdef CONFIG_PPC_MM_SLICES
-	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
-	memcpy(&get_paca()->mm_ctx_high_slices_psize,
-	       &context->high_slices_psize, SLICE_ARRAY_SIZE);
-#else
-	get_paca()->mm_ctx_user_psize = context->user_psize;
-	get_paca()->mm_ctx_sllp = context->sllp;
-#endif
-}
-#else
-static inline void copy_mm_to_paca(mm_context_t *context){}
-#endif
-
+extern void copy_mm_to_paca(struct mm_struct *mm);
 extern struct paca_struct *paca;
 extern void initialise_paca(struct paca_struct *new_paca, int cpu);
 extern void setup_paca(struct paca_struct *new_paca);
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index 3e83d2a20b6f..c4d9654bd637 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -98,21 +98,7 @@ extern u64 ppc64_pft_size;
 #define GET_LOW_SLICE_INDEX(addr)	((addr) >> SLICE_LOW_SHIFT)
 #define GET_HIGH_SLICE_INDEX(addr)	((addr) >> SLICE_HIGH_SHIFT)
 
-/*
- * 1 bit per slice and we have one slice per 1TB
- * Right now we support only 64TB.
- * IF we change this we will have to change the type
- * of high_slices
- */
-#define SLICE_MASK_SIZE 8
-
 #ifndef __ASSEMBLY__
-
-struct slice_mask {
-	u16 low_slices;
-	u64 high_slices;
-};
-
 struct mm_struct;
 
 extern unsigned long slice_get_unmapped_area(unsigned long addr,
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index ae0a23091a9b..723bf48e7494 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -38,6 +38,9 @@ struct power_pmu {
 				unsigned long *valp);
 	int		(*get_alternatives)(u64 event_id, unsigned int flags,
 				u64 alt[]);
+	void		(*get_mem_data_src)(union perf_mem_data_src *dsrc,
+				u32 flags, struct pt_regs *regs);
+	void		(*get_mem_weight)(u64 *weight);
 	u64             (*bhrb_filter_map)(u64 branch_sample_type);
 	void            (*config_bhrb)(u64 pmu_bhrb_filter);
 	void		(*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h
index 0e9c2402dd20..f62797702300 100644
--- a/arch/powerpc/include/asm/powernv.h
+++ b/arch/powerpc/include/asm/powernv.h
@@ -11,9 +11,31 @@
 #define _ASM_POWERNV_H
 
 #ifdef CONFIG_PPC_POWERNV
+#define NPU2_WRITE 1
 extern void powernv_set_nmmu_ptcr(unsigned long ptcr);
+extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+			unsigned long flags,
+			struct npu_context *(*cb)(struct npu_context *, void *),
+			void *priv);
+extern void pnv_npu2_destroy_context(struct npu_context *context,
+				struct pci_dev *gpdev);
+extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
+				unsigned long *flags, unsigned long *status,
+				int count);
 #else
 static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { }
+static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+			unsigned long flags,
+			struct npu_context *(*cb)(struct npu_context *, void *),
+			void *priv) { return ERR_PTR(-ENODEV); }
+static inline void pnv_npu2_destroy_context(struct npu_context *context,
+					struct pci_dev *gpdev) { }
+
+static inline int pnv_npu2_handle_fault(struct npu_context *context,
+					uintptr_t *ea, unsigned long *flags,
+					unsigned long *status, int count) {
+	return -ENODEV;
+}
 #endif
 
 #endif /* _ASM_POWERNV_H */
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index e7d6d86563ee..142d78d645f4 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -161,6 +161,7 @@
 #define PPC_INST_MFTMR			0x7c0002dc
 #define PPC_INST_MSGSND			0x7c00019c
 #define PPC_INST_MSGCLR			0x7c0001dc
+#define PPC_INST_MSGSYNC		0x7c0006ec
 #define PPC_INST_MSGSNDP		0x7c00011c
 #define PPC_INST_MTTMR			0x7c0003dc
 #define PPC_INST_NOP			0x60000000
@@ -345,6 +346,7 @@
 					___PPC_RB(b) | __PPC_EH(eh))
 #define PPC_MSGSND(b)		stringify_in_c(.long PPC_INST_MSGSND | \
 					___PPC_RB(b))
+#define PPC_MSGSYNC		stringify_in_c(.long PPC_INST_MSGSYNC)
 #define PPC_MSGCLR(b)		stringify_in_c(.long PPC_INST_MSGCLR | \
 					___PPC_RB(b))
 #define PPC_MSGSNDP(b)		stringify_in_c(.long PPC_INST_MSGSNDP | \
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index e0fecbcea2a2..a4b1d8d6b793 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -102,11 +102,25 @@ void release_thread(struct task_struct *);
 #endif
 
 #ifdef CONFIG_PPC64
-/* 64-bit user address space is 46-bits (64TB user VM) */
-#define TASK_SIZE_USER64 (0x0000400000000000UL)
+/*
+ * 64-bit user address space can have multiple limits
+ * For now supported values are:
+ */
+#define TASK_SIZE_64TB  (0x0000400000000000UL)
+#define TASK_SIZE_128TB (0x0000800000000000UL)
+#define TASK_SIZE_512TB (0x0002000000000000UL)
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * Max value currently used:
+ */
+#define TASK_SIZE_USER64	TASK_SIZE_512TB
+#else
+#define TASK_SIZE_USER64	TASK_SIZE_64TB
+#endif
 
-/* 
- * 32-bit user address space is 4GB - 1 page 
+/*
+ * 32-bit user address space is 4GB - 1 page
  * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT
  */
 #define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE))
@@ -114,26 +128,37 @@ void release_thread(struct task_struct *);
 #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
 		TASK_SIZE_USER32 : TASK_SIZE_USER64)
 #define TASK_SIZE	  TASK_SIZE_OF(current)
-
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
 #define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4))
-#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_USER64 / 4))
+#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_128TB / 4))
 
 #define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \
 		TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 )
 #endif
 
+/*
+ * Initial task size value for user applications. For book3s 64 we start
+ * with 128TB and conditionally enable upto 512TB
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+#define DEFAULT_MAP_WINDOW	((is_32bit_task()) ? \
+				 TASK_SIZE_USER32 : TASK_SIZE_128TB)
+#else
+#define DEFAULT_MAP_WINDOW	TASK_SIZE
+#endif
+
 #ifdef __powerpc64__
 
-#define STACK_TOP_USER64 TASK_SIZE_USER64
+/* Limit stack to 128TB */
+#define STACK_TOP_USER64 TASK_SIZE_128TB
 #define STACK_TOP_USER32 TASK_SIZE_USER32
 
 #define STACK_TOP (is_32bit_task() ? \
 		   STACK_TOP_USER32 : STACK_TOP_USER64)
 
-#define STACK_TOP_MAX STACK_TOP_USER64
+#define STACK_TOP_MAX TASK_SIZE_USER64
 
 #else /* __powerpc64__ */
 
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index fc879fd6bdae..d4f653c9259a 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -310,6 +310,7 @@
 #define SPRN_PMCR	0x374	/* Power Management Control Register */
 
 /* HFSCR and FSCR bit numbers are the same */
+#define FSCR_SCV_LG	12	/* Enable System Call Vectored */
 #define FSCR_MSGP_LG	10	/* Enable MSGP */
 #define FSCR_TAR_LG	8	/* Enable Target Address Register */
 #define FSCR_EBB_LG	7	/* Enable Event Based Branching */
@@ -320,6 +321,7 @@
 #define FSCR_VECVSX_LG	1	/* Enable VMX/VSX  */
 #define FSCR_FP_LG	0	/* Enable Floating Point */
 #define SPRN_FSCR	0x099	/* Facility Status & Control Register */
+#define   FSCR_SCV	__MASK(FSCR_SCV_LG)
 #define   FSCR_TAR	__MASK(FSCR_TAR_LG)
 #define   FSCR_EBB	__MASK(FSCR_EBB_LG)
 #define   FSCR_DSCR	__MASK(FSCR_DSCR_LG)
@@ -365,6 +367,7 @@
 #define   LPCR_MER_SH		11
 #define	  LPCR_GTSE		ASM_CONST(0x0000000000000400)  	/* Guest Translation Shootdown Enable */
 #define   LPCR_TC		ASM_CONST(0x0000000000000200)	/* Translation control */
+#define   LPCR_HEIC		ASM_CONST(0x0000000000000010)   /* Hypervisor External Interrupt Control */
 #define   LPCR_LPES		0x0000000c
 #define   LPCR_LPES0		ASM_CONST(0x0000000000000008)      /* LPAR Env selector 0 */
 #define   LPCR_LPES1		ASM_CONST(0x0000000000000004)      /* LPAR Env selector 1 */
@@ -656,6 +659,7 @@
 #define   SRR1_ISI_PROT		0x08000000 /* ISI: Other protection fault */
 #define   SRR1_WAKEMASK		0x00380000 /* reason for wakeup */
 #define   SRR1_WAKEMASK_P8	0x003c0000 /* reason for wakeup on POWER8 and 9 */
+#define   SRR1_WAKEMCE_RESVD	0x003c0000 /* Unused/reserved value used by MCE wakeup to indicate cause to idle wakeup handler */
 #define   SRR1_WAKESYSERR	0x00300000 /* System error */
 #define   SRR1_WAKEEE		0x00200000 /* External interrupt */
 #define   SRR1_WAKEHVI		0x00240000 /* Hypervisor Virtualization Interrupt (P9) */
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 7dc006b58369..7902d6358854 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -6,6 +6,8 @@
 #include <linux/uaccess.h>
 #include <asm-generic/sections.h>
 
+extern char __head_end[];
+
 #ifdef __powerpc64__
 
 extern char __start_interrupts[];
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 32db16d2e7ad..ebddb2111d87 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -40,10 +40,12 @@ extern int cpu_to_chip_id(int cpu);
 struct smp_ops_t {
 	void  (*message_pass)(int cpu, int msg);
 #ifdef CONFIG_PPC_SMP_MUXED_IPI
-	void  (*cause_ipi)(int cpu, unsigned long data);
+	void  (*cause_ipi)(int cpu);
 #endif
+	int   (*cause_nmi_ipi)(int cpu);
 	void  (*probe)(void);
 	int   (*kick_cpu)(int nr);
+	int   (*prepare_cpu)(int nr);
 	void  (*setup_cpu)(int nr);
 	void  (*bringup_done)(void);
 	void  (*take_timebase)(void);
@@ -61,7 +63,6 @@ extern void smp_generic_take_timebase(void);
 DECLARE_PER_CPU(unsigned int, cpu_pvr);
 
 #ifdef CONFIG_HOTPLUG_CPU
-extern void migrate_irqs(void);
 int generic_cpu_disable(void);
 void generic_cpu_die(unsigned int cpu);
 void generic_set_cpu_dead(unsigned int cpu);
@@ -112,23 +113,31 @@ extern int cpu_to_core_id(int cpu);
  *
  * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
  * in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_CALL_FUNCTION   0
-#define PPC_MSG_RESCHEDULE      1
+#define PPC_MSG_CALL_FUNCTION	0
+#define PPC_MSG_RESCHEDULE	1
 #define PPC_MSG_TICK_BROADCAST	2
-#define PPC_MSG_DEBUGGER_BREAK  3
+#define PPC_MSG_NMI_IPI		3
 
 /* This is only used by the powernv kernel */
 #define PPC_MSG_RM_HOST_ACTION	4
 
+#define NMI_IPI_ALL_OTHERS		-2
+
+#ifdef CONFIG_NMI_IPI
+extern int smp_handle_nmi_ipi(struct pt_regs *regs);
+#else
+static inline int smp_handle_nmi_ipi(struct pt_regs *regs) { return 0; }
+#endif
+
 /* for irq controllers that have dedicated ipis per message (4) */
 extern int smp_request_message_ipi(int virq, int message);
 extern const char *smp_ipi_name[];
 
 /* for irq controllers with only a single ipi */
-extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
 extern void smp_muxed_ipi_message_pass(int cpu, int msg);
 extern void smp_muxed_ipi_set_message(int cpu, int msg);
 extern irqreturn_t smp_ipi_demux(void);
+extern irqreturn_t smp_ipi_demux_relaxed(void);
 
 void smp_init_pSeries(void);
 void smp_init_cell(void);
diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h
index 23be8f1e7e64..16fab6898240 100644
--- a/arch/powerpc/include/asm/syscalls.h
+++ b/arch/powerpc/include/asm/syscalls.h
@@ -8,10 +8,10 @@
 
 struct rtas_args;
 
-asmlinkage unsigned long sys_mmap(unsigned long addr, size_t len,
+asmlinkage long sys_mmap(unsigned long addr, size_t len,
 		unsigned long prot, unsigned long flags,
 		unsigned long fd, off_t offset);
-asmlinkage unsigned long sys_mmap2(unsigned long addr, size_t len,
+asmlinkage long sys_mmap2(unsigned long addr, size_t len,
 		unsigned long prot, unsigned long flags,
 		unsigned long fd, unsigned long pgoff);
 asmlinkage long ppc64_personality(unsigned long personality);
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 6fc6464f7421..a941cc6fc3e9 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -10,15 +10,7 @@
 
 #ifdef __KERNEL__
 
-/* We have 8k stacks on ppc32 and 16k on ppc64 */
-
-#if defined(CONFIG_PPC64)
-#define THREAD_SHIFT		14
-#elif defined(CONFIG_PPC_256K_PAGES)
-#define THREAD_SHIFT		15
-#else
-#define THREAD_SHIFT		13
-#endif
+#define THREAD_SHIFT		CONFIG_THREAD_SHIFT
 
 #define THREAD_SIZE		(1 << THREAD_SHIFT)
 
diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index e0b9e576905a..7ce2c3ac2964 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -57,7 +57,7 @@ struct icp_ops {
 	void (*teardown_cpu)(void);
 	void (*flush_ipi)(void);
 #ifdef CONFIG_SMP
-	void (*cause_ipi)(int cpu, unsigned long data);
+	void (*cause_ipi)(int cpu);
 	irq_handler_t ipi_action;
 #endif
 };
diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h
new file mode 100644
index 000000000000..1d3f2be5ae39
--- /dev/null
+++ b/arch/powerpc/include/asm/xive-regs.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _ASM_POWERPC_XIVE_REGS_H
+#define _ASM_POWERPC_XIVE_REGS_H
+
+/*
+ * Thread Management (aka "TM") registers
+ */
+
+/* TM register offsets */
+#define TM_QW0_USER		0x000 /* All rings */
+#define TM_QW1_OS		0x010 /* Ring 0..2 */
+#define TM_QW2_HV_POOL		0x020 /* Ring 0..1 */
+#define TM_QW3_HV_PHYS		0x030 /* Ring 0..1 */
+
+/* Byte offsets inside a QW             QW0 QW1 QW2 QW3 */
+#define TM_NSR			0x0  /*  +   +   -   +  */
+#define TM_CPPR			0x1  /*  -   +   -   +  */
+#define TM_IPB			0x2  /*  -   +   +   +  */
+#define TM_LSMFB		0x3  /*  -   +   +   +  */
+#define TM_ACK_CNT		0x4  /*  -   +   -   -  */
+#define TM_INC			0x5  /*  -   +   -   +  */
+#define TM_AGE			0x6  /*  -   +   -   +  */
+#define TM_PIPR			0x7  /*  -   +   -   +  */
+
+#define TM_WORD0		0x0
+#define TM_WORD1		0x4
+
+/*
+ * QW word 2 contains the valid bit at the top and other fields
+ * depending on the QW.
+ */
+#define TM_WORD2		0x8
+#define   TM_QW0W2_VU		PPC_BIT32(0)
+#define   TM_QW0W2_LOGIC_SERV	PPC_BITMASK32(1,31) // XX 2,31 ?
+#define   TM_QW1W2_VO		PPC_BIT32(0)
+#define   TM_QW1W2_OS_CAM	PPC_BITMASK32(8,31)
+#define   TM_QW2W2_VP		PPC_BIT32(0)
+#define   TM_QW2W2_POOL_CAM	PPC_BITMASK32(8,31)
+#define   TM_QW3W2_VT		PPC_BIT32(0)
+#define   TM_QW3W2_LP		PPC_BIT32(6)
+#define   TM_QW3W2_LE		PPC_BIT32(7)
+#define   TM_QW3W2_T		PPC_BIT32(31)
+
+/*
+ * In addition to normal loads to "peek" and writes (only when invalid)
+ * using 4 and 8 bytes accesses, the above registers support these
+ * "special" byte operations:
+ *
+ *   - Byte load from QW0[NSR] - User level NSR (EBB)
+ *   - Byte store to QW0[NSR] - User level NSR (EBB)
+ *   - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
+ *   - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
+ *                                    otherwise VT||0000000
+ *   - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
+ *
+ * Then we have all these "special" CI ops at these offset that trigger
+ * all sorts of side effects:
+ */
+#define TM_SPC_ACK_EBB		0x800	/* Load8 ack EBB to reg*/
+#define TM_SPC_ACK_OS_REG	0x810	/* Load16 ack OS irq to reg */
+#define TM_SPC_PUSH_USR_CTX	0x808	/* Store32 Push/Validate user context */
+#define TM_SPC_PULL_USR_CTX	0x808	/* Load32 Pull/Invalidate user context */
+#define TM_SPC_SET_OS_PENDING	0x812	/* Store8 Set OS irq pending bit */
+#define TM_SPC_PULL_OS_CTX	0x818	/* Load32/Load64 Pull/Invalidate OS context to reg */
+#define TM_SPC_PULL_POOL_CTX	0x828	/* Load32/Load64 Pull/Invalidate Pool context to reg*/
+#define TM_SPC_ACK_HV_REG	0x830	/* Load16 ack HV irq to reg */
+#define TM_SPC_PULL_USR_CTX_OL	0xc08	/* Store8 Pull/Inval usr ctx to odd line */
+#define TM_SPC_ACK_OS_EL	0xc10	/* Store8 ack OS irq to even line */
+#define TM_SPC_ACK_HV_POOL_EL	0xc20	/* Store8 ack HV evt pool to even line */
+#define TM_SPC_ACK_HV_EL	0xc30	/* Store8 ack HV irq to even line */
+/* XXX more... */
+
+/* NSR fields for the various QW ack types */
+#define TM_QW0_NSR_EB		PPC_BIT8(0)
+#define TM_QW1_NSR_EO		PPC_BIT8(0)
+#define TM_QW3_NSR_HE		PPC_BITMASK8(0,1)
+#define  TM_QW3_NSR_HE_NONE	0
+#define  TM_QW3_NSR_HE_POOL	1
+#define  TM_QW3_NSR_HE_PHYS	2
+#define  TM_QW3_NSR_HE_LSI	3
+#define TM_QW3_NSR_I		PPC_BIT8(2)
+#define TM_QW3_NSR_GRP_LVL	PPC_BIT8(3,7)
+
+/* Utilities to manipulate these (originaly from OPAL) */
+#define MASK_TO_LSH(m)		(__builtin_ffsl(m) - 1)
+#define GETFIELD(m, v)		(((v) & (m)) >> MASK_TO_LSH(m))
+#define SETFIELD(m, v, val)				\
+	(((v) & ~(m)) |	((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
+
+#endif /* _ASM_POWERPC_XIVE_REGS_H */
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
new file mode 100644
index 000000000000..3cdbeaeac397
--- /dev/null
+++ b/arch/powerpc/include/asm/xive.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _ASM_POWERPC_XIVE_H
+#define _ASM_POWERPC_XIVE_H
+
+#define XIVE_INVALID_VP	0xffffffff
+
+#ifdef CONFIG_PPC_XIVE
+
+/*
+ * Thread Interrupt Management Area (TIMA)
+ *
+ * This is a global MMIO region divided in 4 pages of varying access
+ * permissions, providing access to per-cpu interrupt management
+ * functions. It always identifies the CPU doing the access based
+ * on the PowerBus initiator ID, thus we always access via the
+ * same offset regardless of where the code is executing
+ */
+extern void __iomem *xive_tima;
+
+/*
+ * Offset in the TM area of our current execution level (provided by
+ * the backend)
+ */
+extern u32 xive_tima_offset;
+
+/*
+ * Per-irq data (irq_get_handler_data for normal IRQs), IPIs
+ * have it stored in the xive_cpu structure. We also cache
+ * for normal interrupts the current target CPU.
+ *
+ * This structure is setup by the backend for each interrupt.
+ */
+struct xive_irq_data {
+	u64 flags;
+	u64 eoi_page;
+	void __iomem *eoi_mmio;
+	u64 trig_page;
+	void __iomem *trig_mmio;
+	u32 esb_shift;
+	int src_chip;
+
+	/* Setup/used by frontend */
+	int target;
+	bool saved_p;
+};
+#define XIVE_IRQ_FLAG_STORE_EOI	0x01
+#define XIVE_IRQ_FLAG_LSI	0x02
+#define XIVE_IRQ_FLAG_SHIFT_BUG	0x04
+#define XIVE_IRQ_FLAG_MASK_FW	0x08
+#define XIVE_IRQ_FLAG_EOI_FW	0x10
+
+#define XIVE_INVALID_CHIP_ID	-1
+
+/* A queue tracking structure in a CPU */
+struct xive_q {
+	__be32 			*qpage;
+	u32			msk;
+	u32			idx;
+	u32			toggle;
+	u64			eoi_phys;
+	u32			esc_irq;
+	atomic_t		count;
+	atomic_t		pending_count;
+};
+
+/*
+ * "magic" Event State Buffer (ESB) MMIO offsets.
+ *
+ * Each interrupt source has a 2-bit state machine called ESB
+ * which can be controlled by MMIO. It's made of 2 bits, P and
+ * Q. P indicates that an interrupt is pending (has been sent
+ * to a queue and is waiting for an EOI). Q indicates that the
+ * interrupt has been triggered while pending.
+ *
+ * This acts as a coalescing mechanism in order to guarantee
+ * that a given interrupt only occurs at most once in a queue.
+ *
+ * When doing an EOI, the Q bit will indicate if the interrupt
+ * needs to be re-triggered.
+ *
+ * The following offsets into the ESB MMIO allow to read or
+ * manipulate the PQ bits. They must be used with an 8-bytes
+ * load instruction. They all return the previous state of the
+ * interrupt (atomically).
+ *
+ * Additionally, some ESB pages support doing an EOI via a
+ * store at 0 and some ESBs support doing a trigger via a
+ * separate trigger page.
+ */
+#define XIVE_ESB_GET		0x800
+#define XIVE_ESB_SET_PQ_00	0xc00
+#define XIVE_ESB_SET_PQ_01	0xd00
+#define XIVE_ESB_SET_PQ_10	0xe00
+#define XIVE_ESB_SET_PQ_11	0xf00
+#define XIVE_ESB_MASK		XIVE_ESB_SET_PQ_01
+
+#define XIVE_ESB_VAL_P		0x2
+#define XIVE_ESB_VAL_Q		0x1
+
+/* Global enable flags for the XIVE support */
+extern bool __xive_enabled;
+
+static inline bool xive_enabled(void) { return __xive_enabled; }
+
+extern bool xive_native_init(void);
+extern void xive_smp_probe(void);
+extern int  xive_smp_prepare_cpu(unsigned int cpu);
+extern void xive_smp_setup_cpu(void);
+extern void xive_smp_disable_cpu(void);
+extern void xive_kexec_teardown_cpu(int secondary);
+extern void xive_shutdown(void);
+extern void xive_flush_interrupt(void);
+
+/* xmon hook */
+extern void xmon_xive_do_dump(int cpu);
+
+/* APIs used by KVM */
+extern u32 xive_native_default_eq_shift(void);
+extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
+extern void xive_native_free_vp_block(u32 vp_base);
+extern int xive_native_populate_irq_data(u32 hw_irq,
+					 struct xive_irq_data *data);
+extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
+extern u32 xive_native_alloc_irq(void);
+extern void xive_native_free_irq(u32 irq);
+extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
+
+extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
+				       __be32 *qpage, u32 order, bool can_escalate);
+extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
+
+extern bool __xive_irq_trigger(struct xive_irq_data *xd);
+extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
+extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
+
+extern bool is_xive_irq(struct irq_chip *chip);
+
+#else
+
+static inline bool xive_enabled(void) { return false; }
+
+static inline bool xive_native_init(void) { return false; }
+static inline void xive_smp_probe(void) { }
+extern inline int  xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
+static inline void xive_smp_setup_cpu(void) { }
+static inline void xive_smp_disable_cpu(void) { }
+static inline void xive_kexec_teardown_cpu(int secondary) { }
+static inline void xive_shutdown(void) { }
+static inline void xive_flush_interrupt(void) { }
+
+static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
+static inline void xive_native_free_vp_block(u32 vp_base) { }
+
+#endif
+
+#endif /* _ASM_POWERPC_XIVE_H */
diff --git a/arch/powerpc/include/asm/xmon.h b/arch/powerpc/include/asm/xmon.h
index 5eb8e599e5cc..eb42a0c6e1d9 100644
--- a/arch/powerpc/include/asm/xmon.h
+++ b/arch/powerpc/include/asm/xmon.h
@@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { };
 extern int cpus_are_in_xmon(void);
 #endif
 
+extern void xmon_printf(const char *format, ...);
+
 #endif /* __KERNEL __ */
 #endif /* __ASM_POWERPC_XMON_H */
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index 03c06ba7464f..ab45cc2f3101 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,4 +29,20 @@
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
 
+/*
+ * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2),
+ * encode the log2 of the huge page size. A value of zero indicates that the
+ * default huge page size should be used. To use a non-default huge page size,
+ * one of these defines can be used, or the size can be encoded by hand. Note
+ * that on most systems only a subset, or possibly none, of these sizes will be
+ * available.
+ */
+#define MAP_HUGE_512KB	(19 << MAP_HUGE_SHIFT)	/* 512KB HugeTLB Page */
+#define MAP_HUGE_1MB	(20 << MAP_HUGE_SHIFT)	/* 1MB   HugeTLB Page */
+#define MAP_HUGE_2MB	(21 << MAP_HUGE_SHIFT)	/* 2MB   HugeTLB Page */
+#define MAP_HUGE_8MB	(23 << MAP_HUGE_SHIFT)	/* 8MB   HugeTLB Page */
+#define MAP_HUGE_16MB	(24 << MAP_HUGE_SHIFT)	/* 16MB  HugeTLB Page */
+#define MAP_HUGE_1GB	(30 << MAP_HUGE_SHIFT)	/* 1GB   HugeTLB Page */
+#define MAP_HUGE_16GB	(34 << MAP_HUGE_SHIFT)	/* 16GB  HugeTLB Page */
+
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 811f441a125f..b9db46ae545b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
-# do not trace tracer code
-CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 # timers used by tracing
 CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
 endif
@@ -97,6 +95,7 @@ obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_OPTPROBES)		+= optprobes.o optprobes_head.o
+obj-$(CONFIG_KPROBES_ON_FTRACE)	+= kprobes-ftrace.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
 obj-$(CONFIG_PPC_UDBG_16550)	+= legacy_serial.o udbg_16550.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
@@ -118,10 +117,7 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_PPC_IO_WORKAROUNDS)	+= io-workarounds.o
 
-obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
-obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
-obj-$(CONFIG_TRACING)		+= trace_clock.o
+obj-y				+= trace/
 
 ifneq ($(CONFIG_PPC_INDIRECT_PIO),y)
 obj-y				+= iomap.o
@@ -142,14 +138,14 @@ obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
 # Disable GCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
 UBSAN_SANITIZE_prom_init.o := n
-GCOV_PROFILE_ftrace.o := n
-UBSAN_SANITIZE_ftrace.o := n
 GCOV_PROFILE_machine_kexec_64.o := n
 UBSAN_SANITIZE_machine_kexec_64.o := n
 GCOV_PROFILE_machine_kexec_32.o := n
 UBSAN_SANITIZE_machine_kexec_32.o := n
 GCOV_PROFILE_kprobes.o := n
 UBSAN_SANITIZE_kprobes.o := n
+GCOV_PROFILE_kprobes-ftrace.o := n
+UBSAN_SANITIZE_kprobes-ftrace.o := n
 UBSAN_SANITIZE_vdso.o := n
 
 extra-$(CONFIG_PPC_FPU)		+= fpu.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4367e7df51a1..439c257dec4a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -185,6 +185,7 @@ int main(void)
 #ifdef CONFIG_PPC_MM_SLICES
 	OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
 	OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
+	DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit));
 	DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
 #endif /* CONFIG_PPC_MM_SLICES */
 #endif
@@ -219,6 +220,7 @@ int main(void)
 	OFFSET(PACA_EXGEN, paca_struct, exgen);
 	OFFSET(PACA_EXMC, paca_struct, exmc);
 	OFFSET(PACA_EXSLB, paca_struct, exslb);
+	OFFSET(PACA_EXNMI, paca_struct, exnmi);
 	OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr);
 	OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr);
 	OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid);
@@ -232,7 +234,9 @@ int main(void)
 	OFFSET(PACAEMERGSP, paca_struct, emergency_sp);
 #ifdef CONFIG_PPC_BOOK3S_64
 	OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
+	OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
 	OFFSET(PACA_IN_MCE, paca_struct, in_mce);
+	OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
 #endif
 	OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
 	OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
@@ -399,8 +403,8 @@ int main(void)
 	DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
 #endif
 
-#ifdef MAX_PGD_TABLE_SIZE
-	DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE);
+#ifdef CONFIG_PPC_BOOK3S_64
+	DEFINE(PGD_TABLE_SIZE, (sizeof(pgd_t) << max(RADIX_PGD_INDEX_SIZE, H_PGD_INDEX_SIZE)));
 #else
 	DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
 #endif
@@ -727,6 +731,7 @@ int main(void)
 	OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state);
 	OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
 	OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
+	OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas);
 #endif
 
 	DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 7fe8c79e6937..10cb2896b2ae 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -29,7 +29,8 @@ _GLOBAL(__setup_cpu_power7)
 	li	r0,0
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
-	bl	__init_LPCR
+	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
+	bl	__init_LPCR_ISA206
 	bl	__init_tlb_power7
 	mtlr	r11
 	blr
@@ -42,7 +43,8 @@ _GLOBAL(__restore_cpu_power7)
 	li	r0,0
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
-	bl	__init_LPCR
+	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
+	bl	__init_LPCR_ISA206
 	bl	__init_tlb_power7
 	mtlr	r11
 	blr
@@ -59,7 +61,8 @@ _GLOBAL(__setup_cpu_power8)
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
-	bl	__init_LPCR
+	li	r4,0 /* LPES = 0 */
+	bl	__init_LPCR_ISA206
 	bl	__init_HFSCR
 	bl	__init_tlb_power8
 	bl	__init_PMU_HV
@@ -80,7 +83,8 @@ _GLOBAL(__restore_cpu_power8)
 	mtspr	SPRN_LPID,r0
 	mfspr   r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
-	bl	__init_LPCR
+	li	r4,0 /* LPES = 0 */
+	bl	__init_LPCR_ISA206
 	bl	__init_HFSCR
 	bl	__init_tlb_power8
 	bl	__init_PMU_HV
@@ -99,11 +103,12 @@ _GLOBAL(__setup_cpu_power9)
 	mtspr	SPRN_PSSCR,r0
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
-	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE)
+	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE  | LPCR_HEIC)
 	or	r3, r3, r4
 	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
 	andc	r3, r3, r4
-	bl	__init_LPCR
+	li	r4,0 /* LPES = 0 */
+	bl	__init_LPCR_ISA300
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
 	bl	__init_PMU_HV
@@ -122,11 +127,12 @@ _GLOBAL(__restore_cpu_power9)
 	mtspr	SPRN_PSSCR,r0
 	mtspr	SPRN_LPID,r0
 	mfspr   r3,SPRN_LPCR
-	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE)
+	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
 	or	r3, r3, r4
 	LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
 	andc	r3, r3, r4
-	bl	__init_LPCR
+	li	r4,0 /* LPES = 0 */
+	bl	__init_LPCR_ISA300
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
 	bl	__init_PMU_HV
@@ -144,9 +150,9 @@ __init_hvmode_206:
 	std	r5,CPU_SPEC_FEATURES(r4)
 	blr
 
-__init_LPCR:
+__init_LPCR_ISA206:
 	/* Setup a sane LPCR:
-	 *   Called with initial LPCR in R3
+	 *   Called with initial LPCR in R3 and desired LPES 2-bit value in R4
 	 *
 	 *   LPES = 0b01 (HSRR0/1 used for 0x500)
 	 *   PECE = 0b111
@@ -157,16 +163,18 @@ __init_LPCR:
 	 *
 	 * Other bits untouched for now
 	 */
-	li	r5,1
-	rldimi	r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
+	li	r5,0x10
+	rldimi	r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
+
+	/* POWER9 has no VRMASD */
+__init_LPCR_ISA300:
+	rldimi	r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
 	ori	r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
 	li	r5,4
 	rldimi	r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
 	clrrdi	r3,r3,1		/* clear HDICE */
 	li	r5,4
 	rldimi	r3,r5, LPCR_VC_SH, 0
-	li	r5,0x10
-	rldimi	r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
 	mtspr	SPRN_LPCR,r3
 	isync
 	blr
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 2128f3a96c32..b6fe883b1016 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -20,18 +20,60 @@
 #include <asm/kvm_ppc.h>
 
 #ifdef CONFIG_SMP
-void doorbell_setup_this_cpu(void)
+
+/*
+ * Doorbells must only be used if CPU_FTR_DBELL is available.
+ * msgsnd is used in HV, and msgsndp is used in !HV.
+ *
+ * These should be used by platform code that is aware of restrictions.
+ * Other arch code should use ->cause_ipi.
+ *
+ * doorbell_global_ipi() sends a dbell to any target CPU.
+ * Must be used only by architectures that address msgsnd target
+ * by PIR/get_hard_smp_processor_id.
+ */
+void doorbell_global_ipi(int cpu)
 {
-	unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK;
+	u32 tag = get_hard_smp_processor_id(cpu);
 
-	smp_muxed_ipi_set_data(smp_processor_id(), tag);
+	kvmppc_set_host_ipi(cpu, 1);
+	/* Order previous accesses vs. msgsnd, which is treated as a store */
+	ppc_msgsnd_sync();
+	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
 }
 
-void doorbell_cause_ipi(int cpu, unsigned long data)
+/*
+ * doorbell_core_ipi() sends a dbell to a target CPU in the same core.
+ * Must be used only by architectures that address msgsnd target
+ * by TIR/cpu_thread_in_core.
+ */
+void doorbell_core_ipi(int cpu)
 {
+	u32 tag = cpu_thread_in_core(cpu);
+
+	kvmppc_set_host_ipi(cpu, 1);
 	/* Order previous accesses vs. msgsnd, which is treated as a store */
-	mb();
-	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data);
+	ppc_msgsnd_sync();
+	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
+}
+
+/*
+ * Attempt to cause a core doorbell if destination is on the same core.
+ * Returns 1 on success, 0 on failure.
+ */
+int doorbell_try_core_ipi(int cpu)
+{
+	int this_cpu = get_cpu();
+	int ret = 0;
+
+	if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) {
+		doorbell_core_ipi(cpu);
+		ret = 1;
+	}
+
+	put_cpu();
+
+	return ret;
 }
 
 void doorbell_exception(struct pt_regs *regs)
@@ -40,12 +82,14 @@ void doorbell_exception(struct pt_regs *regs)
 
 	irq_enter();
 
+	ppc_msgsync();
+
 	may_hard_irq_enable();
 
 	kvmppc_set_host_ipi(smp_processor_id(), 0);
 	__this_cpu_inc(irq_stat.doorbell_irqs);
 
-	smp_ipi_demux();
+	smp_ipi_demux_relaxed(); /* already performed the barrier */
 
 	irq_exit();
 	set_irq_regs(old_regs);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9de7f79e702b..63992b2d8e15 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -22,7 +22,6 @@
  */
 
 #include <linux/delay.h>
-#include <linux/debugfs.h>
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/list.h>
@@ -37,7 +36,7 @@
 #include <linux/of.h>
 
 #include <linux/atomic.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/eeh.h>
 #include <asm/eeh_event.h>
 #include <asm/io.h>
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index b94887165a10..c405c79e50cd 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -724,7 +724,16 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
  */
 #define MAX_WAIT_FOR_RECOVERY 300
 
-static void eeh_handle_normal_event(struct eeh_pe *pe)
+/**
+ * eeh_handle_normal_event - Handle EEH events on a specific PE
+ * @pe: EEH PE
+ *
+ * Attempts to recover the given PE.  If recovery fails or the PE has failed
+ * too many times, remove the PE.
+ *
+ * Returns true if @pe should no longer be used, else false.
+ */
+static bool eeh_handle_normal_event(struct eeh_pe *pe)
 {
 	struct pci_bus *frozen_bus;
 	struct eeh_dev *edev, *tmp;
@@ -736,13 +745,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	if (!frozen_bus) {
 		pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
 			__func__, pe->phb->global_number, pe->addr);
-		return;
+		return false;
 	}
 
 	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
-	if (pe->freeze_count > eeh_max_freezes)
-		goto excess_failures;
+	if (pe->freeze_count > eeh_max_freezes) {
+		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
+		       "last hour and has been permanently disabled.\n",
+		       pe->phb->global_number, pe->addr,
+		       pe->freeze_count);
+		goto hard_fail;
+	}
 	pr_warn("EEH: This PCI device has failed %d times in the last hour\n",
 		pe->freeze_count);
 
@@ -870,27 +884,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	pr_info("EEH: Notify device driver to resume\n");
 	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
 
-	return;
+	return false;
 
-excess_failures:
+hard_fail:
 	/*
 	 * About 90% of all real-life EEH failures in the field
 	 * are due to poorly seated PCI cards. Only 10% or so are
 	 * due to actual, failed cards.
 	 */
-	pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
-	       "last hour and has been permanently disabled.\n"
-	       "Please try reseating or replacing it.\n",
-		pe->phb->global_number, pe->addr,
-		pe->freeze_count);
-	goto perm_error;
-
-hard_fail:
 	pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
 	       "Please try reseating or replacing it\n",
 		pe->phb->global_number, pe->addr);
 
-perm_error:
 	eeh_slot_error_detail(pe, EEH_LOG_PERM);
 
 	/* Notify all devices that they're about to go down. */
@@ -915,10 +920,21 @@ perm_error:
 			pci_lock_rescan_remove();
 			pci_hp_remove_devices(frozen_bus);
 			pci_unlock_rescan_remove();
+
+			/* The passed PE should no longer be used */
+			return true;
 		}
 	}
+	return false;
 }
 
+/**
+ * eeh_handle_special_event - Handle EEH events without a specific failing PE
+ *
+ * Called when an EEH event is detected but can't be narrowed down to a
+ * specific PE.  Iterates through possible failures and handles them as
+ * necessary.
+ */
 static void eeh_handle_special_event(void)
 {
 	struct eeh_pe *pe, *phb_pe;
@@ -982,7 +998,14 @@ static void eeh_handle_special_event(void)
 		 */
 		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
 		    rc == EEH_NEXT_ERR_FENCED_PHB) {
-			eeh_handle_normal_event(pe);
+			/*
+			 * eeh_handle_normal_event() can make the PE stale if it
+			 * determines that the PE cannot possibly be recovered.
+			 * Don't modify the PE state if that's the case.
+			 */
+			if (eeh_handle_normal_event(pe))
+				continue;
+
 			eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
 		} else {
 			pci_lock_rescan_remove();
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index a38600949f3a..8587059ad848 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -31,7 +31,6 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
-#include <asm/ftrace.h>
 #include <asm/ptrace.h>
 #include <asm/export.h>
 
@@ -1315,109 +1314,3 @@ machine_check_in_rtas:
 	/* XXX load up BATs and panic */
 
 #endif /* CONFIG_PPC_RTAS */
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-_GLOBAL(mcount)
-_GLOBAL(_mcount)
-	/*
-	 * It is required that _mcount on PPC32 must preserve the
-	 * link register. But we have r0 to play with. We use r0
-	 * to push the return address back to the caller of mcount
-	 * into the ctr register, restore the link register and
-	 * then jump back using the ctr register.
-	 */
-	mflr	r0
-	mtctr	r0
-	lwz	r0, 4(r1)
-	mtlr	r0
-	bctr
-
-_GLOBAL(ftrace_caller)
-	MCOUNT_SAVE_FRAME
-	/* r3 ends up with link register */
-	subi	r3, r3, MCOUNT_INSN_SIZE
-.globl ftrace_call
-ftrace_call:
-	bl	ftrace_stub
-	nop
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
-	b	ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-#endif
-	MCOUNT_RESTORE_FRAME
-	/* old link register ends up in ctr reg */
-	bctr
-#else
-_GLOBAL(mcount)
-_GLOBAL(_mcount)
-
-	MCOUNT_SAVE_FRAME
-
-	subi	r3, r3, MCOUNT_INSN_SIZE
-	LOAD_REG_ADDR(r5, ftrace_trace_function)
-	lwz	r5,0(r5)
-
-	mtctr	r5
-	bctrl
-	nop
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	b	ftrace_graph_caller
-#endif
-	MCOUNT_RESTORE_FRAME
-	bctr
-#endif
-EXPORT_SYMBOL(_mcount)
-
-_GLOBAL(ftrace_stub)
-	blr
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-_GLOBAL(ftrace_graph_caller)
-	/* load r4 with local address */
-	lwz	r4, 44(r1)
-	subi	r4, r4, MCOUNT_INSN_SIZE
-
-	/* Grab the LR out of the caller stack frame */
-	lwz	r3,52(r1)
-
-	bl	prepare_ftrace_return
-	nop
-
-        /*
-         * prepare_ftrace_return gives us the address we divert to.
-         * Change the LR in the callers stack frame to this.
-         */
-	stw	r3,52(r1)
-
-	MCOUNT_RESTORE_FRAME
-	/* old link register ends up in ctr reg */
-	bctr
-
-_GLOBAL(return_to_handler)
-	/* need to save return values */
-	stwu	r1, -32(r1)
-	stw	r3, 20(r1)
-	stw	r4, 16(r1)
-	stw	r31, 12(r1)
-	mr	r31, r1
-
-	bl	ftrace_return_to_handler
-	nop
-
-	/* return value has real return address */
-	mtlr	r3
-
-	lwz	r3, 20(r1)
-	lwz	r4, 16(r1)
-	lwz	r31,12(r1)
-	lwz	r1, 0(r1)
-
-	/* Jump back to real return address */
-	blr
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
-#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 767ef6d68c9e..bfbad08a1207 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -20,7 +20,6 @@
 
 #include <linux/errno.h>
 #include <linux/err.h>
-#include <linux/magic.h>
 #include <asm/unistd.h>
 #include <asm/processor.h>
 #include <asm/page.h>
@@ -33,7 +32,6 @@
 #include <asm/bug.h>
 #include <asm/ptrace.h>
 #include <asm/irqflags.h>
-#include <asm/ftrace.h>
 #include <asm/hw_irq.h>
 #include <asm/context_tracking.h>
 #include <asm/tm.h>
@@ -1173,381 +1171,3 @@ _GLOBAL(enter_prom)
 	ld	r0,16(r1)
 	mtlr    r0
         blr
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-_GLOBAL(mcount)
-_GLOBAL(_mcount)
-EXPORT_SYMBOL(_mcount)
-	mflr	r12
-	mtctr	r12
-	mtlr	r0
-	bctr
-
-#ifndef CC_USING_MPROFILE_KERNEL
-_GLOBAL_TOC(ftrace_caller)
-	/* Taken from output of objdump from lib64/glibc */
-	mflr	r3
-	ld	r11, 0(r1)
-	stdu	r1, -112(r1)
-	std	r3, 128(r1)
-	ld	r4, 16(r11)
-	subi	r3, r3, MCOUNT_INSN_SIZE
-.globl ftrace_call
-ftrace_call:
-	bl	ftrace_stub
-	nop
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
-	b	ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-#endif
-	ld	r0, 128(r1)
-	mtlr	r0
-	addi	r1, r1, 112
-
-#else /* CC_USING_MPROFILE_KERNEL */
-/*
- *
- * ftrace_caller() is the function that replaces _mcount() when ftrace is
- * active.
- *
- * We arrive here after a function A calls function B, and we are the trace
- * function for B. When we enter r1 points to A's stack frame, B has not yet
- * had a chance to allocate one yet.
- *
- * Additionally r2 may point either to the TOC for A, or B, depending on
- * whether B did a TOC setup sequence before calling us.
- *
- * On entry the LR points back to the _mcount() call site, and r0 holds the
- * saved LR as it was on entry to B, ie. the original return address at the
- * call site in A.
- *
- * Our job is to save the register state into a struct pt_regs (on the stack)
- * and then arrange for the ftrace function to be called.
- */
-_GLOBAL(ftrace_caller)
-	/* Save the original return address in A's stack frame */
-	std	r0,LRSAVE(r1)
-
-	/* Create our stack frame + pt_regs */
-	stdu	r1,-SWITCH_FRAME_SIZE(r1)
-
-	/* Save all gprs to pt_regs */
-	SAVE_8GPRS(0,r1)
-	SAVE_8GPRS(8,r1)
-	SAVE_8GPRS(16,r1)
-	SAVE_8GPRS(24,r1)
-
-	/* Load special regs for save below */
-	mfmsr   r8
-	mfctr   r9
-	mfxer   r10
-	mfcr	r11
-
-	/* Get the _mcount() call site out of LR */
-	mflr	r7
-	/* Save it as pt_regs->nip & pt_regs->link */
-	std     r7, _NIP(r1)
-	std     r7, _LINK(r1)
-
-	/* Save callee's TOC in the ABI compliant location */
-	std	r2, 24(r1)
-	ld	r2,PACATOC(r13)	/* get kernel TOC in r2 */
-
-	addis	r3,r2,function_trace_op@toc@ha
-	addi	r3,r3,function_trace_op@toc@l
-	ld	r5,0(r3)
-
-#ifdef CONFIG_LIVEPATCH
-	mr	r14,r7		/* remember old NIP */
-#endif
-	/* Calculate ip from nip-4 into r3 for call below */
-	subi    r3, r7, MCOUNT_INSN_SIZE
-
-	/* Put the original return address in r4 as parent_ip */
-	mr	r4, r0
-
-	/* Save special regs */
-	std     r8, _MSR(r1)
-	std     r9, _CTR(r1)
-	std     r10, _XER(r1)
-	std     r11, _CCR(r1)
-
-	/* Load &pt_regs in r6 for call below */
-	addi    r6, r1 ,STACK_FRAME_OVERHEAD
-
-	/* ftrace_call(r3, r4, r5, r6) */
-.globl ftrace_call
-ftrace_call:
-	bl	ftrace_stub
-	nop
-
-	/* Load ctr with the possibly modified NIP */
-	ld	r3, _NIP(r1)
-	mtctr	r3
-#ifdef CONFIG_LIVEPATCH
-	cmpd	r14,r3		/* has NIP been altered? */
-#endif
-
-	/* Restore gprs */
-	REST_8GPRS(0,r1)
-	REST_8GPRS(8,r1)
-	REST_8GPRS(16,r1)
-	REST_8GPRS(24,r1)
-
-	/* Restore callee's TOC */
-	ld	r2, 24(r1)
-
-	/* Pop our stack frame */
-	addi r1, r1, SWITCH_FRAME_SIZE
-
-	/* Restore original LR for return to B */
-	ld	r0, LRSAVE(r1)
-	mtlr	r0
-
-#ifdef CONFIG_LIVEPATCH
-        /* Based on the cmpd above, if the NIP was altered handle livepatch */
-	bne-	livepatch_handler
-#endif
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	stdu	r1, -112(r1)
-.globl ftrace_graph_call
-ftrace_graph_call:
-	b	ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-	addi	r1, r1, 112
-#endif
-
-	ld	r0,LRSAVE(r1)	/* restore callee's lr at _mcount site */
-	mtlr	r0
-	bctr			/* jump after _mcount site */
-#endif /* CC_USING_MPROFILE_KERNEL */
-
-_GLOBAL(ftrace_stub)
-	blr
-
-#ifdef CONFIG_LIVEPATCH
-	/*
-	 * This function runs in the mcount context, between two functions. As
-	 * such it can only clobber registers which are volatile and used in
-	 * function linkage.
-	 *
-	 * We get here when a function A, calls another function B, but B has
-	 * been live patched with a new function C.
-	 *
-	 * On entry:
-	 *  - we have no stack frame and can not allocate one
-	 *  - LR points back to the original caller (in A)
-	 *  - CTR holds the new NIP in C
-	 *  - r0 & r12 are free
-	 *
-	 * r0 can't be used as the base register for a DS-form load or store, so
-	 * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
-	 */
-livepatch_handler:
-	CURRENT_THREAD_INFO(r12, r1)
-
-	/* Save stack pointer into r0 */
-	mr	r0, r1
-
-	/* Allocate 3 x 8 bytes */
-	ld	r1, TI_livepatch_sp(r12)
-	addi	r1, r1, 24
-	std	r1, TI_livepatch_sp(r12)
-
-	/* Save toc & real LR on livepatch stack */
-	std	r2,  -24(r1)
-	mflr	r12
-	std	r12, -16(r1)
-
-	/* Store stack end marker */
-	lis     r12, STACK_END_MAGIC@h
-	ori     r12, r12, STACK_END_MAGIC@l
-	std	r12, -8(r1)
-
-	/* Restore real stack pointer */
-	mr	r1, r0
-
-	/* Put ctr in r12 for global entry and branch there */
-	mfctr	r12
-	bctrl
-
-	/*
-	 * Now we are returning from the patched function to the original
-	 * caller A. We are free to use r0 and r12, and we can use r2 until we
-	 * restore it.
-	 */
-
-	CURRENT_THREAD_INFO(r12, r1)
-
-	/* Save stack pointer into r0 */
-	mr	r0, r1
-
-	ld	r1, TI_livepatch_sp(r12)
-
-	/* Check stack marker hasn't been trashed */
-	lis     r2,  STACK_END_MAGIC@h
-	ori     r2,  r2, STACK_END_MAGIC@l
-	ld	r12, -8(r1)
-1:	tdne	r12, r2
-	EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
-
-	/* Restore LR & toc from livepatch stack */
-	ld	r12, -16(r1)
-	mtlr	r12
-	ld	r2,  -24(r1)
-
-	/* Pop livepatch stack frame */
-	CURRENT_THREAD_INFO(r12, r0)
-	subi	r1, r1, 24
-	std	r1, TI_livepatch_sp(r12)
-
-	/* Restore real stack pointer */
-	mr	r1, r0
-
-	/* Return to original caller of live patched function */
-	blr
-#endif
-
-
-#else
-_GLOBAL_TOC(_mcount)
-EXPORT_SYMBOL(_mcount)
-	/* Taken from output of objdump from lib64/glibc */
-	mflr	r3
-	ld	r11, 0(r1)
-	stdu	r1, -112(r1)
-	std	r3, 128(r1)
-	ld	r4, 16(r11)
-
-	subi	r3, r3, MCOUNT_INSN_SIZE
-	LOAD_REG_ADDR(r5,ftrace_trace_function)
-	ld	r5,0(r5)
-	ld	r5,0(r5)
-	mtctr	r5
-	bctrl
-	nop
-
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	b	ftrace_graph_caller
-#endif
-	ld	r0, 128(r1)
-	mtlr	r0
-	addi	r1, r1, 112
-_GLOBAL(ftrace_stub)
-	blr
-
-#endif /* CONFIG_DYNAMIC_FTRACE */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-#ifndef CC_USING_MPROFILE_KERNEL
-_GLOBAL(ftrace_graph_caller)
-	/* load r4 with local address */
-	ld	r4, 128(r1)
-	subi	r4, r4, MCOUNT_INSN_SIZE
-
-	/* Grab the LR out of the caller stack frame */
-	ld	r11, 112(r1)
-	ld	r3, 16(r11)
-
-	bl	prepare_ftrace_return
-	nop
-
-	/*
-	 * prepare_ftrace_return gives us the address we divert to.
-	 * Change the LR in the callers stack frame to this.
-	 */
-	ld	r11, 112(r1)
-	std	r3, 16(r11)
-
-	ld	r0, 128(r1)
-	mtlr	r0
-	addi	r1, r1, 112
-	blr
-
-#else /* CC_USING_MPROFILE_KERNEL */
-_GLOBAL(ftrace_graph_caller)
-	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
-	std	r10, 104(r1)
-	std	r9, 96(r1)
-	std	r8, 88(r1)
-	std	r7, 80(r1)
-	std	r6, 72(r1)
-	std	r5, 64(r1)
-	std	r4, 56(r1)
-	std	r3, 48(r1)
-
-	/* Save callee's TOC in the ABI compliant location */
-	std	r2, 24(r1)
-	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
-
-	mfctr	r4		/* ftrace_caller has moved local addr here */
-	std	r4, 40(r1)
-	mflr	r3		/* ftrace_caller has restored LR from stack */
-	subi	r4, r4, MCOUNT_INSN_SIZE
-
-	bl	prepare_ftrace_return
-	nop
-
-	/*
-	 * prepare_ftrace_return gives us the address we divert to.
-	 * Change the LR to this.
-	 */
-	mtlr	r3
-
-	ld	r0, 40(r1)
-	mtctr	r0
-	ld	r10, 104(r1)
-	ld	r9, 96(r1)
-	ld	r8, 88(r1)
-	ld	r7, 80(r1)
-	ld	r6, 72(r1)
-	ld	r5, 64(r1)
-	ld	r4, 56(r1)
-	ld	r3, 48(r1)
-
-	/* Restore callee's TOC */
-	ld	r2, 24(r1)
-
-	addi	r1, r1, 112
-	mflr	r0
-	std	r0, LRSAVE(r1)
-	bctr
-#endif /* CC_USING_MPROFILE_KERNEL */
-
-_GLOBAL(return_to_handler)
-	/* need to save return values */
-	std	r4,  -32(r1)
-	std	r3,  -24(r1)
-	/* save TOC */
-	std	r2,  -16(r1)
-	std	r31, -8(r1)
-	mr	r31, r1
-	stdu	r1, -112(r1)
-
-	/*
-	 * We might be called from a module.
-	 * Switch to our TOC to run inside the core kernel.
-	 */
-	ld	r2, PACATOC(r13)
-
-	bl	ftrace_return_to_handler
-	nop
-
-	/* return value has real return address */
-	mtlr	r3
-
-	ld	r1, 0(r1)
-	ld	r4,  -32(r1)
-	ld	r3,  -24(r1)
-	ld	r2,  -16(r1)
-	ld	r31, -8(r1)
-
-	/* Jump back to real return address */
-	blr
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6353019966e6..a9312b52fe6f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -116,9 +116,11 @@ EXC_VIRT_NONE(0x4000, 0x100)
 
 EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
 	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	clrrdi	r13,r13,1 /* Last bit of HSPRG0 is set if waking from winkle */
-	EXCEPTION_PROLOG_PSERIES_PACA(PACA_EXGEN, system_reset_common, EXC_STD,
+	/*
+	 * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is
+	 * being used, so a nested NMI exception would corrupt it.
+	 */
+	EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common, EXC_STD,
 				 IDLETEST, 0x100)
 
 EXC_REAL_END(system_reset, 0x100, 0x100)
@@ -126,34 +128,37 @@ EXC_VIRT_NONE(0x4100, 0x100)
 
 #ifdef CONFIG_PPC_P7_NAP
 EXC_COMMON_BEGIN(system_reset_idle_common)
-BEGIN_FTR_SECTION
-	GET_PACA(r13) /* Restore HSPRG0 to get the winkle bit in r13 */
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-	bl	pnv_restore_hyp_resource
+	b	pnv_powersave_wakeup
+#endif
 
-	li	r0,PNV_THREAD_RUNNING
-	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
+EXC_COMMON_BEGIN(system_reset_common)
+	/*
+	 * Increment paca->in_nmi then enable MSR_RI. SLB or MCE will be able
+	 * to recover, but nested NMI will notice in_nmi and not recover
+	 * because of the use of the NMI stack. in_nmi reentrancy is tested in
+	 * system_reset_exception.
+	 */
+	lhz	r10,PACA_IN_NMI(r13)
+	addi	r10,r10,1
+	sth	r10,PACA_IN_NMI(r13)
+	li	r10,MSR_RI
+	mtmsrd 	r10,1
 
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	li	r0,KVM_HWTHREAD_IN_KERNEL
-	stb	r0,HSTATE_HWTHREAD_STATE(r13)
-	/* Order setting hwthread_state vs. testing hwthread_req */
-	sync
-	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
-	cmpwi	r0,0
-	beq	1f
-	BRANCH_TO_KVM(r10, kvm_start_guest)
-1:
-#endif
+	mr	r10,r1
+	ld	r1,PACA_NMI_EMERG_SP(r13)
+	subi	r1,r1,INT_FRAME_SIZE
+	EXCEPTION_COMMON_NORET_STACK(PACA_EXNMI, 0x100,
+			system_reset, system_reset_exception,
+			ADD_NVGPRS;ADD_RECONCILE)
 
-	/* Return SRR1 from power7_nap() */
-	mfspr	r3,SPRN_SRR1
-	blt	cr3,2f
-	b	pnv_wakeup_loss
-2:	b	pnv_wakeup_noloss
-#endif
+	/*
+	 * The stack is no longer in use, decrement in_nmi.
+	 */
+	lhz	r10,PACA_IN_NMI(r13)
+	subi	r10,r10,1
+	sth	r10,PACA_IN_NMI(r13)
 
-EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
+	b	ret_from_except
 
 #ifdef CONFIG_PPC_PSERIES
 /*
@@ -161,8 +166,9 @@ EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
  */
 TRAMP_REAL_BEGIN(system_reset_fwnmi)
 	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
-				 NOTEST, 0x100)
+	/* See comment at system_reset exception */
+	EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common,
+						EXC_STD, NOTEST, 0x100)
 #endif /* CONFIG_PPC_PSERIES */
 
 
@@ -172,14 +178,6 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
 	 * vector
 	 */
 	SET_SCRATCH0(r13)		/* save r13 */
-	/*
-	 * Running native on arch 2.06 or later, we may wakeup from winkle
-	 * inside machine check. If yes, then last bit of HSPRG0 would be set
-	 * to 1. Hence clear it unconditionally.
-	 */
-	GET_PACA(r13)
-	clrrdi	r13,r13,1
-	SET_PACA(r13)
 	EXCEPTION_PROLOG_0(PACA_EXMC)
 BEGIN_FTR_SECTION
 	b	machine_check_powernv_early
@@ -212,6 +210,12 @@ BEGIN_FTR_SECTION
 	 * NOTE: We are here with MSR_ME=0 (off), which means we risk a
 	 * checkstop if we get another machine check exception before we do
 	 * rfid with MSR_ME=1.
+	 *
+	 * This interrupt can wake directly from idle. If that is the case,
+	 * the machine check is handled then the idle wakeup code is called
+	 * to restore state. In that case, the POWER9 DD1 idle PACA workaround
+	 * is not applied in the early machine check code, which will cause
+	 * bugs.
 	 */
 	mr	r11,r1			/* Save r1 */
 	lhz	r10,PACA_IN_MCE(r13)
@@ -268,20 +272,11 @@ machine_check_fwnmi:
 machine_check_pSeries_0:
 	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
 	/*
-	 * The following is essentially EXCEPTION_PROLOG_PSERIES_1 with the
-	 * difference that MSR_RI is not enabled, because PACA_EXMC is being
-	 * used, so nested machine check corrupts it. machine_check_common
-	 * enables MSR_RI.
+	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
+	 * nested machine check corrupts it. machine_check_common enables
+	 * MSR_RI.
 	 */
-	ld	r10,PACAKMSR(r13)
-	xori	r10,r10,MSR_RI
-	mfspr	r11,SPRN_SRR0
-	LOAD_HANDLER(r12, machine_check_common)
-	mtspr	SPRN_SRR0,r12
-	mfspr	r12,SPRN_SRR1
-	mtspr	SPRN_SRR1,r10
-	rfid
-	b	.	/* prevent speculative execution */
+	EXCEPTION_PROLOG_PSERIES_1_NORI(machine_check_common, EXC_STD)
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
@@ -340,6 +335,37 @@ EXC_COMMON_BEGIN(machine_check_common)
 	/* restore original r1. */			\
 	ld	r1,GPR1(r1)
 
+#ifdef CONFIG_PPC_P7_NAP
+/*
+ * This is an idle wakeup. Low level machine check has already been
+ * done. Queue the event then call the idle code to do the wake up.
+ */
+EXC_COMMON_BEGIN(machine_check_idle_common)
+	bl	machine_check_queue_event
+
+	/*
+	 * We have not used any non-volatile GPRs here, and as a rule
+	 * most exception code including machine check does not.
+	 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
+	 * wakeup will restore volatile registers.
+	 *
+	 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
+	 *
+	 * Then decrement MCE nesting after finishing with the stack.
+	 */
+	ld	r3,_MSR(r1)
+
+	lhz	r11,PACA_IN_MCE(r13)
+	subi	r11,r11,1
+	sth	r11,PACA_IN_MCE(r13)
+
+	/* Turn off the RI bit because SRR1 is used by idle wakeup code. */
+	/* Recoverability could be improved by reducing the use of SRR1. */
+	li	r11,0
+	mtmsrd	r11,1
+
+	b	pnv_powersave_wakeup_mce
+#endif
 	/*
 	 * Handle machine check early in real mode. We come here with
 	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
@@ -352,6 +378,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	bl	machine_check_early
 	std	r3,RESULT(r1)	/* Save result */
 	ld	r12,_MSR(r1)
+
 #ifdef	CONFIG_PPC_P7_NAP
 	/*
 	 * Check if thread was in power saving mode. We come here when any
@@ -362,48 +389,14 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	 *
 	 * Go back to nap/sleep/winkle mode again if (b) is true.
 	 */
-	rlwinm.	r11,r12,47-31,30,31	/* Was it in power saving mode? */
-	beq	4f			/* No, it wasn;t */
-	/* Thread was in power saving mode. Go back to nap again. */
-	cmpwi	r11,2
-	blt	3f
-	/* Supervisor/Hypervisor state loss */
-	li	r0,1
-	stb	r0,PACA_NAPSTATELOST(r13)
-3:	bl	machine_check_queue_event
-	MACHINE_CHECK_HANDLER_WINDUP
-	GET_PACA(r13)
-	ld	r1,PACAR1(r13)
-	/*
-	 * Check what idle state this CPU was in and go back to same mode
-	 * again.
-	 */
-	lbz	r3,PACA_THREAD_IDLE_STATE(r13)
-	cmpwi	r3,PNV_THREAD_NAP
-	bgt	10f
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
-	/* No return */
-10:
-	cmpwi	r3,PNV_THREAD_SLEEP
-	bgt	2f
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
-	/* No return */
-
-2:
-	/*
-	 * Go back to winkle. Please note that this thread was woken up in
-	 * machine check from winkle and have not restored the per-subcore
-	 * state. Hence before going back to winkle, set last bit of HSPRG0
-	 * to 1. This will make sure that if this thread gets woken up
-	 * again at reset vector 0x100 then it will get chance to restore
-	 * the subcore state.
-	 */
-	ori	r13,r13,1
-	SET_PACA(r13)
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
-	/* No return */
+	BEGIN_FTR_SECTION
+	rlwinm.	r11,r12,47-31,30,31
+	beq-	4f
+	BRANCH_TO_COMMON(r10, machine_check_idle_common)
 4:
+	END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif
+
 	/*
 	 * Check if we are coming from hypervisor userspace. If yes then we
 	 * continue in host kernel in V mode to deliver the MC event.
@@ -968,17 +961,12 @@ EXC_VIRT_NONE(0x4e60, 0x20)
 TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
 TRAMP_REAL_BEGIN(hmi_exception_early)
 	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60)
-	mr	r10,r1			/* Save r1			*/
-	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
+	mr	r10,r1			/* Save r1 */
+	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack for realmode */
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
-	std	r9,_CCR(r1)		/* save CR in stackframe	*/
 	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
-	std	r11,_NIP(r1)		/* save HSRR0 in stackframe	*/
-	mfspr	r12,SPRN_HSRR1		/* Save SRR1 */
-	std	r12,_MSR(r1)		/* save SRR1 in stackframe	*/
-	std	r10,0(r1)		/* make stack chain pointer	*/
-	std	r0,GPR0(r1)		/* save r0 in stackframe	*/
-	std	r10,GPR1(r1)		/* save r1 in stackframe	*/
+	mfspr	r12,SPRN_HSRR1		/* Save HSRR1 */
+	EXCEPTION_PROLOG_COMMON_1()
 	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
 	EXCEPTION_PROLOG_COMMON_3(0xe60)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ff0dd4e77a7..243dbef7e926 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -30,17 +30,16 @@
 #include <linux/string.h>
 #include <linux/memblock.h>
 #include <linux/delay.h>
-#include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/crash_dump.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 
+#include <asm/debugfs.h>
 #include <asm/page.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
 #include <asm/fadump.h>
-#include <asm/debug.h>
 #include <asm/setup.h>
 
 static struct fw_dump fw_dump;
@@ -319,15 +318,34 @@ int __init fadump_reserve_mem(void)
 		pr_debug("fadumphdr_addr = %p\n",
 				(void *) fw_dump.fadumphdr_addr);
 	} else {
-		/* Reserve the memory at the top of memory. */
 		size = get_fadump_area_size();
-		base = memory_boundary - size;
-		memblock_reserve(base, size);
-		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
-				"for firmware-assisted dump\n",
-				(unsigned long)(size >> 20),
-				(unsigned long)(base >> 20));
+
+		/*
+		 * Reserve memory at an offset closer to bottom of the RAM to
+		 * minimize the impact of memory hot-remove operation. We can't
+		 * use memblock_find_in_range() here since it doesn't allocate
+		 * from bottom to top.
+		 */
+		for (base = fw_dump.boot_memory_size;
+		     base <= (memory_boundary - size);
+		     base += size) {
+			if (memblock_is_region_memory(base, size) &&
+			    !memblock_is_region_reserved(base, size))
+				break;
+		}
+		if ((base > (memory_boundary - size)) ||
+		    memblock_reserve(base, size)) {
+			pr_err("Failed to reserve memory\n");
+			return 0;
+		}
+
+		pr_info("Reserved %ldMB of memory at %ldMB for firmware-"
+			"assisted dump (System RAM: %ldMB)\n",
+			(unsigned long)(size >> 20),
+			(unsigned long)(base >> 20),
+			(unsigned long)(memblock_phys_mem_size() >> 20));
 	}
+
 	fw_dump.reserve_dump_area_start = base;
 	fw_dump.reserve_dump_area_size = size;
 	return 1;
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 1607be7c0ef2..e22734278458 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -735,11 +735,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
 	EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE)
 	EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE)
 	EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE)
-	EXCEPTION(0x2f00, MOLTrampoline, unknown_exception, EXC_XFER_EE_LITE)
-
-	.globl mol_trampoline
-	.set mol_trampoline, i0x2f00
-	EXPORT_SYMBOL(mol_trampoline)
+	EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE)
 
 	. = 0x3000
 
@@ -1278,16 +1274,6 @@ EXPORT_SYMBOL(empty_zero_page)
 swapper_pg_dir:
 	.space	PGD_TABLE_SIZE
 
-	.globl intercept_table
-intercept_table:
-	.long 0, 0, i0x200, i0x300, i0x400, 0, i0x600, i0x700
-	.long i0x800, 0, 0, 0, 0, i0xd00, 0, 0
-	.long 0, 0, 0, i0x1300, 0, 0, 0, 0
-	.long 0, 0, 0, 0, 0, 0, 0, 0
-	.long 0, 0, 0, 0, 0, 0, 0, 0
-	.long 0, 0, 0, 0, 0, 0, 0, 0
-EXPORT_SYMBOL(intercept_table)
-
 /* Room for two PTE pointers, usually the kernel and current user pointers
  * to their respective root page table.
  */
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 1dc5eae2ced3..0ddc602b33a4 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -949,7 +949,8 @@ start_here_multiplatform:
 	LOAD_REG_ADDR(r3,init_thread_union)
 
 	/* set up a stack pointer */
-	addi	r1,r3,THREAD_SIZE
+	LOAD_REG_IMMEDIATE(r1,THREAD_SIZE)
+	add	r1,r3,r1
 	li	r0,0
 	stdu	r0,-STACK_FRAME_OVERHEAD(r1)
 
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 6fd08219248d..07d4e0ad60db 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -20,6 +20,7 @@
 #include <asm/kvm_book3s_asm.h>
 #include <asm/opal.h>
 #include <asm/cpuidle.h>
+#include <asm/exception-64s.h>
 #include <asm/book3s/64/mmu-hash.h>
 #include <asm/mmu.h>
 
@@ -94,12 +95,12 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 core_idle_lock_held:
 	HMT_LOW
 3:	lwz	r15,0(r14)
-	andi.   r15,r15,PNV_CORE_IDLE_LOCK_BIT
+	andis.	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	bne	3b
 	HMT_MEDIUM
 	lwarx	r15,0,r14
-	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
-	bne	core_idle_lock_held
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bne-	core_idle_lock_held
 	blr
 
 /*
@@ -113,7 +114,7 @@ core_idle_lock_held:
  *
  * Address to 'rfid' to in r5
  */
-_GLOBAL(pnv_powersave_common)
+pnv_powersave_common:
 	/* Use r3 to pass state nap/sleep/winkle */
 	/* NAP is a state loss, we create a regs frame on the
 	 * stack, fill it up with the state we care about and
@@ -188,8 +189,8 @@ pnv_enter_arch207_idle_mode:
 	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
 	/* MUST occur in real mode, i.e. with the MMU off,    */
 	/* and the MMU must stay off until we clear this flag */
-	/* and test HSTATE_HWTHREAD_REQ(r13) in the system    */
-	/* reset interrupt vector in exceptions-64s.S.        */
+	/* and test HSTATE_HWTHREAD_REQ(r13) in               */
+	/* pnv_powersave_wakeup in this file.                 */
 	/* The reason is that another thread can switch the   */
 	/* MMU to a guest context whenever this flag is set   */
 	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
@@ -209,15 +210,20 @@ pnv_enter_arch207_idle_mode:
 	/* Sleep or winkle */
 	lbz	r7,PACA_THREAD_MASK(r13)
 	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
+	li	r5,0
+	beq	cr3,3f
+	lis	r5,PNV_CORE_IDLE_WINKLE_COUNT@h
+3:
 lwarx_loop1:
 	lwarx	r15,0,r14
 
-	andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
-	bnel	core_idle_lock_held
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
 
+	add	r15,r15,r5			/* Add if winkle */
 	andc	r15,r15,r7			/* Clear thread bit */
 
-	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
+	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
 
 /*
  * If cr0 = 0, then current thread is the last thread of the core entering
@@ -240,7 +246,7 @@ common_enter: /* common code for all the threads entering sleep or winkle */
 	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
 
 fastsleep_workaround_at_entry:
-	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
+	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	stwcx.	r15,0,r14
 	bne-	lwarx_loop1
 	isync
@@ -250,10 +256,10 @@ fastsleep_workaround_at_entry:
 	li	r4,1
 	bl	opal_config_cpu_idle_state
 
-	/* Clear Lock bit */
-	li	r0,0
+	/* Unlock */
+	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	lwsync
-	stw	r0,0(r14)
+	stw	r15,0(r14)
 	b	common_enter
 
 enter_winkle:
@@ -301,8 +307,8 @@ power_enter_stop:
 
 lwarx_loop_stop:
 	lwarx   r15,0,r14
-	andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
-	bnel    core_idle_lock_held
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
 	andc    r15,r15,r7                      /* Clear thread bit */
 
 	stwcx.  r15,0,r14
@@ -375,17 +381,113 @@ _GLOBAL(power9_idle_stop)
 	li	r4,1
 	b	pnv_powersave_common
 	/* No return */
+
 /*
- * Called from reset vector. Check whether we have woken up with
- * hypervisor state loss. If yes, restore hypervisor state and return
- * back to reset vector.
+ * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1,
+ * HSPRG0 will be set to the HSPRG0 value of one of the
+ * threads in this core. Thus the value we have in r13
+ * may not be this thread's paca pointer.
+ *
+ * Fortunately, the TIR remains invariant. Since this thread's
+ * paca pointer is recorded in all its sibling's paca, we can
+ * correctly recover this thread's paca pointer if we
+ * know the index of this thread in the core.
+ *
+ * This index can be obtained from the TIR.
  *
- * r13 - Contents of HSPRG0
+ * i.e, thread's position in the core = TIR.
+ * If this value is i, then this thread's paca is
+ * paca->thread_sibling_pacas[i].
+ */
+power9_dd1_recover_paca:
+	mfspr	r4, SPRN_TIR
+	/*
+	 * Since each entry in thread_sibling_pacas is 8 bytes
+	 * we need to left-shift by 3 bits. Thus r4 = i * 8
+	 */
+	sldi	r4, r4, 3
+	/* Get &paca->thread_sibling_pacas[0] in r5 */
+	ld	r5, PACA_SIBLING_PACA_PTRS(r13)
+	/* Load paca->thread_sibling_pacas[i] into r13 */
+	ldx	r13, r4, r5
+	SET_PACA(r13)
+	/*
+	 * Indicate that we have lost NVGPR state
+	 * which needs to be restored from the stack.
+	 */
+	li	r3, 1
+	stb	r0,PACA_NAPSTATELOST(r13)
+	blr
+
+/*
+ * Called from machine check handler for powersave wakeups.
+ * Low level machine check processing has already been done. Now just
+ * go through the wake up path to get everything in order.
+ *
+ * r3 - The original SRR1 value.
+ * Original SRR[01] have been clobbered.
+ * MSR_RI is clear.
+ */
+.global pnv_powersave_wakeup_mce
+pnv_powersave_wakeup_mce:
+	/* Set cr3 for pnv_powersave_wakeup */
+	rlwinm	r11,r3,47-31,30,31
+	cmpwi	cr3,r11,2
+
+	/*
+	 * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
+	 * reason into SRR1, which allows reuse of the system reset wakeup
+	 * code without being mistaken for another type of wakeup.
+	 */
+	oris	r3,r3,SRR1_WAKEMCE_RESVD@h
+	mtspr	SPRN_SRR1,r3
+
+	b	pnv_powersave_wakeup
+
+/*
+ * Called from reset vector for powersave wakeups.
  * cr3 - set to gt if waking up with partial/complete hypervisor state loss
  */
-_GLOBAL(pnv_restore_hyp_resource)
+.global pnv_powersave_wakeup
+pnv_powersave_wakeup:
+	ld	r2, PACATOC(r13)
+
 BEGIN_FTR_SECTION
-	ld	r2,PACATOC(r13);
+BEGIN_FTR_SECTION_NESTED(70)
+	bl	power9_dd1_recover_paca
+END_FTR_SECTION_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70)
+	bl	pnv_restore_hyp_resource_arch300
+FTR_SECTION_ELSE
+	bl	pnv_restore_hyp_resource_arch207
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
+
+	li	r0,PNV_THREAD_RUNNING
+	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	li	r0,KVM_HWTHREAD_IN_KERNEL
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	sync
+	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r0,0
+	beq	1f
+	b	kvm_start_guest
+1:
+#endif
+
+	/* Return SRR1 from power7_nap() */
+	mfspr	r3,SPRN_SRR1
+	blt	cr3,pnv_wakeup_noloss
+	b	pnv_wakeup_loss
+
+/*
+ * Check whether we have woken up with hypervisor state loss.
+ * If yes, restore hypervisor state and return back to link.
+ *
+ * cr3 - set to gt if waking up with partial/complete hypervisor state loss
+ */
+pnv_restore_hyp_resource_arch300:
 	/*
 	 * POWER ISA 3. Use PSSCR to determine if we
 	 * are waking up from deep idle state
@@ -400,31 +502,19 @@ BEGIN_FTR_SECTION
 	 */
 	rldicl  r5,r5,4,60
 	cmpd	cr4,r5,r4
-	bge	cr4,pnv_wakeup_tb_loss
-	/*
-	 * Waking up without hypervisor state loss. Return to
-	 * reset vector
-	 */
-	blr
+	bge	cr4,pnv_wakeup_tb_loss /* returns to caller */
 
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+	blr	/* Waking up without hypervisor state loss. */
 
+/* Same calling convention as arch300 */
+pnv_restore_hyp_resource_arch207:
 	/*
 	 * POWER ISA 2.07 or less.
-	 * Check if last bit of HSPGR0 is set. This indicates whether we are
-	 * waking up from winkle.
+	 * Check if we slept with sleep or winkle.
 	 */
-	clrldi	r5,r13,63
-	clrrdi	r13,r13,1
-
-	/* Now that we are sure r13 is corrected, load TOC */
-	ld	r2,PACATOC(r13);
-	cmpwi	cr4,r5,1
-	mtspr	SPRN_HSPRG0,r13
-
-	lbz	r0,PACA_THREAD_IDLE_STATE(r13)
-	cmpwi   cr2,r0,PNV_THREAD_NAP
-	bgt     cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
+	lbz	r4,PACA_THREAD_IDLE_STATE(r13)
+	cmpwi	cr2,r4,PNV_THREAD_NAP
+	bgt	cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
 
 	/*
 	 * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
@@ -433,8 +523,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	 */
 	bgt	cr3,.
 
-	blr	/* Return back to System Reset vector from where
-		   pnv_restore_hyp_resource was invoked */
+	blr	/* Waking up without hypervisor state loss */
 
 /*
  * Called if waking up from idle state which can cause either partial or
@@ -444,9 +533,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
  *
  * r13 - PACA
  * cr3 - gt if waking up with partial/complete hypervisor state loss
+ *
+ * If ISA300:
  * cr4 - gt or eq if waking up from complete hypervisor state loss.
+ *
+ * If ISA207:
+ * r4 - PACA_THREAD_IDLE_STATE
  */
-_GLOBAL(pnv_wakeup_tb_loss)
+pnv_wakeup_tb_loss:
 	ld	r1,PACAR1(r13)
 	/*
 	 * Before entering any idle state, the NVGPRs are saved in the stack.
@@ -473,18 +567,19 @@ _GLOBAL(pnv_wakeup_tb_loss)
 	 * is required to return back to reset vector after hypervisor state
 	 * restore is complete.
 	 */
+	mr	r18,r4
 	mflr	r17
 	mfspr	r16,SPRN_SRR1
 BEGIN_FTR_SECTION
 	CHECK_HMI_INTERRUPT
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 
-	lbz	r7,PACA_THREAD_MASK(r13)
 	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
-lwarx_loop2:
-	lwarx	r15,0,r14
-	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
+	lbz	r7,PACA_THREAD_MASK(r13)
+
 	/*
+	 * Take the core lock to synchronize against other threads.
+	 *
 	 * Lock bit is set in one of the 2 cases-
 	 * a. In the sleep/winkle enter path, the last thread is executing
 	 * fastsleep workaround code.
@@ -492,23 +587,93 @@ lwarx_loop2:
 	 * workaround undo code or resyncing timebase or restoring context
 	 * In either case loop until the lock bit is cleared.
 	 */
-	bnel	core_idle_lock_held
+1:
+	lwarx	r15,0,r14
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
+	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	stwcx.	r15,0,r14
+	bne-	1b
+	isync
 
-	cmpwi	cr2,r15,0
+	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
+	cmpwi	cr2,r9,0
 
 	/*
 	 * At this stage
 	 * cr2 - eq if first thread to wakeup in core
 	 * cr3-  gt if waking up with partial/complete hypervisor state loss
+	 * ISA300:
 	 * cr4 - gt or eq if waking up from complete hypervisor state loss.
 	 */
 
-	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
-	stwcx.	r15,0,r14
-	bne-	lwarx_loop2
-	isync
-
 BEGIN_FTR_SECTION
+	/*
+	 * Were we in winkle?
+	 * If yes, check if all threads were in winkle, decrement our
+	 * winkle count, set all thread winkle bits if all were in winkle.
+	 * Check if our thread has a winkle bit set, and set cr4 accordingly
+	 * (to match ISA300, above). Pseudo-code for core idle state
+	 * transitions for ISA207 is as follows (everything happens atomically
+	 * due to store conditional and/or lock bit):
+	 *
+	 * nap_idle() { }
+	 * nap_wake() { }
+	 *
+	 * sleep_idle()
+	 * {
+	 *	core_idle_state &= ~thread_in_core
+	 * }
+	 *
+	 * sleep_wake()
+	 * {
+	 *     bool first_in_core, first_in_subcore;
+	 *
+	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+	 *
+	 *     core_idle_state |= thread_in_core;
+	 * }
+	 *
+	 * winkle_idle()
+	 * {
+	 *	core_idle_state &= ~thread_in_core;
+	 *	core_idle_state += 1 << WINKLE_COUNT_SHIFT;
+	 * }
+	 *
+	 * winkle_wake()
+	 * {
+	 *     bool first_in_core, first_in_subcore, winkle_state_lost;
+	 *
+	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+	 *
+	 *     core_idle_state |= thread_in_core;
+	 *
+	 *     if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT))
+	 *         core_idle_state |= THREAD_WINKLE_BITS;
+	 *     core_idle_state -= 1 << WINKLE_COUNT_SHIFT;
+	 *
+	 *     winkle_state_lost = core_idle_state &
+	 *				(thread_in_core << WINKLE_THREAD_SHIFT);
+	 *     core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT);
+	 * }
+	 *
+	 */
+	cmpwi	r18,PNV_THREAD_WINKLE
+	bne	2f
+	andis.	r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h
+	subis	r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h
+	beq	2f
+	ori	r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */
+2:
+	/* Shift thread bit to winkle mask, then test if this thread is set,
+	 * and remove it from the winkle bits */
+	slwi	r8,r7,8
+	and	r8,r8,r15
+	andc	r15,r15,r8
+	cmpwi	cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */
+
 	lbz	r4,PACA_SUBCORE_SIBLING_MASK(r13)
 	and	r4,r4,r15
 	cmpwi	r4,0	/* Check if first in subcore */
@@ -593,7 +758,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	mtspr	SPRN_WORC,r4
 
 clear_lock:
-	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
+	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	lwsync
 	stw	r15,0(r14)
 
@@ -651,8 +816,7 @@ hypervisor_state_restored:
 
 	mtspr	SPRN_SRR1,r16
 	mtlr	r17
-	blr	/* Return back to System Reset vector from where
-		   pnv_restore_hyp_resource was invoked */
+	blr		/* return to pnv_powersave_wakeup */
 
 fastsleep_workaround_at_exit:
 	li	r3,1
@@ -664,7 +828,8 @@ fastsleep_workaround_at_exit:
  * R3 here contains the value that will be returned to the caller
  * of power7_nap.
  */
-_GLOBAL(pnv_wakeup_loss)
+.global pnv_wakeup_loss
+pnv_wakeup_loss:
 	ld	r1,PACAR1(r13)
 BEGIN_FTR_SECTION
 	CHECK_HMI_INTERRUPT
@@ -684,7 +849,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
  * R3 here contains the value that will be returned to the caller
  * of power7_nap.
  */
-_GLOBAL(pnv_wakeup_noloss)
+pnv_wakeup_noloss:
 	lbz	r0,PACA_NAPSTATELOST(r13)
 	cmpwi	r0,0
 	bne	pnv_wakeup_loss
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 5f202a566ec5..5a3231fedf08 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -711,13 +711,16 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	return tbl;
 }
 
-void iommu_free_table(struct iommu_table *tbl, const char *node_name)
+static void iommu_table_free(struct kref *kref)
 {
 	unsigned long bitmap_sz;
 	unsigned int order;
+	struct iommu_table *tbl;
 
-	if (!tbl)
-		return;
+	tbl = container_of(kref, struct iommu_table, it_kref);
+
+	if (tbl->it_ops->free)
+		tbl->it_ops->free(tbl);
 
 	if (!tbl->it_map) {
 		kfree(tbl);
@@ -733,7 +736,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
-		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
+		pr_warn("%s: Unexpected TCEs\n", __func__);
 
 	/* calculate bitmap size in bytes */
 	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
@@ -746,6 +749,24 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	kfree(tbl);
 }
 
+struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
+{
+	if (kref_get_unless_zero(&tbl->it_kref))
+		return tbl;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_get);
+
+int iommu_tce_table_put(struct iommu_table *tbl)
+{
+	if (WARN_ON(!tbl))
+		return 0;
+
+	return kref_put(&tbl->it_kref, iommu_table_free);
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_put);
+
 /* Creates TCEs for a user provided buffer.  The user buffer must be
  * contiguous real kernel storage (not vmalloc).  The address passed here
  * comprises a page address and offset into that page. The dma_addr_t
@@ -1004,6 +1025,31 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_xchg);
 
+#ifdef CONFIG_PPC_BOOK3S_64
+long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret;
+
+	ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+
+	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+			(*direction == DMA_BIDIRECTIONAL))) {
+		struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
+
+		if (likely(pg)) {
+			SetPageDirty(pg);
+		} else {
+			tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+			ret = -EFAULT;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
+#endif
+
 int iommu_take_ownership(struct iommu_table *tbl)
 {
 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index a018f5cae899..5c291df30fe3 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -65,7 +65,6 @@
 #include <asm/machdep.h>
 #include <asm/udbg.h>
 #include <asm/smp.h>
-#include <asm/debug.h>
 #include <asm/livepatch.h>
 #include <asm/asm-prototypes.h>
 
@@ -442,46 +441,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	return sum;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-void migrate_irqs(void)
-{
-	struct irq_desc *desc;
-	unsigned int irq;
-	static int warned;
-	cpumask_var_t mask;
-	const struct cpumask *map = cpu_online_mask;
-
-	alloc_cpumask_var(&mask, GFP_KERNEL);
-
-	for_each_irq_desc(irq, desc) {
-		struct irq_data *data;
-		struct irq_chip *chip;
-
-		data = irq_desc_get_irq_data(desc);
-		if (irqd_is_per_cpu(data))
-			continue;
-
-		chip = irq_data_get_irq_chip(data);
-
-		cpumask_and(mask, irq_data_get_affinity_mask(data), map);
-		if (cpumask_any(mask) >= nr_cpu_ids) {
-			pr_warn("Breaking affinity for irq %i\n", irq);
-			cpumask_copy(mask, map);
-		}
-		if (chip->irq_set_affinity)
-			chip->irq_set_affinity(data, mask, true);
-		else if (desc->action && !(warned++))
-			pr_err("Cannot set affinity for irq %i\n", irq);
-	}
-
-	free_cpumask_var(mask);
-
-	local_irq_enable();
-	mdelay(1);
-	local_irq_disable();
-}
-#endif
-
 static inline void check_stack_overflow(void)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
new file mode 100644
index 000000000000..6c089d9757c9
--- /dev/null
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -0,0 +1,104 @@
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ * Copyright 2016 Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+ *		  IBM Corporation
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+
+static nokprobe_inline
+int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+		      struct kprobe_ctlblk *kcb, unsigned long orig_nip)
+{
+	/*
+	 * Emulate singlestep (and also recover regs->nip)
+	 * as if there is a nop
+	 */
+	regs->nip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+	if (unlikely(p->post_handler)) {
+		kcb->kprobe_status = KPROBE_HIT_SSDONE;
+		p->post_handler(p, regs, 0);
+	}
+	__this_cpu_write(current_kprobe, NULL);
+	if (orig_nip)
+		regs->nip = orig_nip;
+	return 1;
+}
+
+int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+		    struct kprobe_ctlblk *kcb)
+{
+	if (kprobe_ftrace(p))
+		return __skip_singlestep(p, regs, kcb, 0);
+	else
+		return 0;
+}
+NOKPROBE_SYMBOL(skip_singlestep);
+
+/* Ftrace callback handler for kprobes */
+void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
+			   struct ftrace_ops *ops, struct pt_regs *regs)
+{
+	struct kprobe *p;
+	struct kprobe_ctlblk *kcb;
+	unsigned long flags;
+
+	/* Disable irq for emulating a breakpoint and avoiding preempt */
+	local_irq_save(flags);
+	hard_irq_disable();
+
+	p = get_kprobe((kprobe_opcode_t *)nip);
+	if (unlikely(!p) || kprobe_disabled(p))
+		goto end;
+
+	kcb = get_kprobe_ctlblk();
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(p);
+	} else {
+		unsigned long orig_nip = regs->nip;
+
+		/*
+		 * On powerpc, NIP is *before* this instruction for the
+		 * pre handler
+		 */
+		regs->nip -= MCOUNT_INSN_SIZE;
+
+		__this_cpu_write(current_kprobe, p);
+		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		if (!p->pre_handler || !p->pre_handler(p, regs))
+			__skip_singlestep(p, regs, kcb, orig_nip);
+		/*
+		 * If pre_handler returns !0, it sets regs->nip and
+		 * resets current kprobe.
+		 */
+	}
+end:
+	local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+	p->ainsn.insn = NULL;
+	p->ainsn.boostable = -1;
+	return 0;
+}
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index fce05a38851c..160ae0fa7d0d 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -35,6 +35,7 @@
 #include <asm/code-patching.h>
 #include <asm/cacheflush.h>
 #include <asm/sstep.h>
+#include <asm/sections.h>
 #include <linux/uaccess.h>
 
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -42,7 +43,86 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
 
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+	return  (addr >= (unsigned long)__kprobes_text_start &&
+		 addr < (unsigned long)__kprobes_text_end) ||
+		(addr >= (unsigned long)_stext &&
+		 addr < (unsigned long)__head_end);
+}
+
+kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
+{
+	kprobe_opcode_t *addr;
+
+#ifdef PPC64_ELF_ABI_v2
+	/* PPC64 ABIv2 needs local entry point */
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+	if (addr && !offset) {
+#ifdef CONFIG_KPROBES_ON_FTRACE
+		unsigned long faddr;
+		/*
+		 * Per livepatch.h, ftrace location is always within the first
+		 * 16 bytes of a function on powerpc with -mprofile-kernel.
+		 */
+		faddr = ftrace_location_range((unsigned long)addr,
+					      (unsigned long)addr + 16);
+		if (faddr)
+			addr = (kprobe_opcode_t *)faddr;
+		else
+#endif
+			addr = (kprobe_opcode_t *)ppc_function_entry(addr);
+	}
+#elif defined(PPC64_ELF_ABI_v1)
+	/*
+	 * 64bit powerpc ABIv1 uses function descriptors:
+	 * - Check for the dot variant of the symbol first.
+	 * - If that fails, try looking up the symbol provided.
+	 *
+	 * This ensures we always get to the actual symbol and not
+	 * the descriptor.
+	 *
+	 * Also handle <module:symbol> format.
+	 */
+	char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];
+	const char *modsym;
+	bool dot_appended = false;
+	if ((modsym = strchr(name, ':')) != NULL) {
+		modsym++;
+		if (*modsym != '\0' && *modsym != '.') {
+			/* Convert to <module:.symbol> */
+			strncpy(dot_name, name, modsym - name);
+			dot_name[modsym - name] = '.';
+			dot_name[modsym - name + 1] = '\0';
+			strncat(dot_name, modsym,
+				sizeof(dot_name) - (modsym - name) - 2);
+			dot_appended = true;
+		} else {
+			dot_name[0] = '\0';
+			strncat(dot_name, name, sizeof(dot_name) - 1);
+		}
+	} else if (name[0] != '.') {
+		dot_name[0] = '.';
+		dot_name[1] = '\0';
+		strncat(dot_name, name, KSYM_NAME_LEN - 2);
+		dot_appended = true;
+	} else {
+		dot_name[0] = '\0';
+		strncat(dot_name, name, KSYM_NAME_LEN - 1);
+	}
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
+	if (!addr && dot_appended) {
+		/* Let's try the original non-dot symbol lookup	*/
+		addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+	}
+#else
+	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+#endif
+
+	return addr;
+}
+
+int arch_prepare_kprobe(struct kprobe *p)
 {
 	int ret = 0;
 	kprobe_opcode_t insn = *p->addr;
@@ -74,30 +154,34 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
 	p->ainsn.boostable = 0;
 	return ret;
 }
+NOKPROBE_SYMBOL(arch_prepare_kprobe);
 
-void __kprobes arch_arm_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
 	*p->addr = BREAKPOINT_INSTRUCTION;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
+NOKPROBE_SYMBOL(arch_arm_kprobe);
 
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
+void arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
+NOKPROBE_SYMBOL(arch_disarm_kprobe);
 
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void arch_remove_kprobe(struct kprobe *p)
 {
 	if (p->ainsn.insn) {
 		free_insn_slot(p->ainsn.insn, 0);
 		p->ainsn.insn = NULL;
 	}
 }
+NOKPROBE_SYMBOL(arch_remove_kprobe);
 
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+static nokprobe_inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 {
 	enable_single_step(regs);
 
@@ -110,37 +194,80 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 	regs->nip = (unsigned long)p->ainsn.insn;
 }
 
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
 	kcb->prev_kprobe.kp = kprobe_running();
 	kcb->prev_kprobe.status = kcb->kprobe_status;
 	kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr;
 }
 
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
 	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
 	kcb->kprobe_status = kcb->prev_kprobe.status;
 	kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr;
 }
 
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 				struct kprobe_ctlblk *kcb)
 {
 	__this_cpu_write(current_kprobe, p);
 	kcb->kprobe_saved_msr = regs->msr;
 }
 
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-				      struct pt_regs *regs)
+bool arch_function_offset_within_entry(unsigned long offset)
+{
+#ifdef PPC64_ELF_ABI_v2
+#ifdef CONFIG_KPROBES_ON_FTRACE
+	return offset <= 16;
+#else
+	return offset <= 8;
+#endif
+#else
+	return !offset;
+#endif
+}
+
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
 	ri->ret_addr = (kprobe_opcode_t *)regs->link;
 
 	/* Replace the return addr with trampoline addr */
 	regs->link = (unsigned long)kretprobe_trampoline;
 }
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
 
-int __kprobes kprobe_handler(struct pt_regs *regs)
+int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
+{
+	int ret;
+	unsigned int insn = *p->ainsn.insn;
+
+	/* regs->nip is also adjusted if emulate_step returns 1 */
+	ret = emulate_step(regs, insn);
+	if (ret > 0) {
+		/*
+		 * Once this instruction has been boosted
+		 * successfully, set the boostable flag
+		 */
+		if (unlikely(p->ainsn.boostable == 0))
+			p->ainsn.boostable = 1;
+	} else if (ret < 0) {
+		/*
+		 * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
+		 * So, we should never get here... but, its still
+		 * good to catch them, just in case...
+		 */
+		printk("Can't step on instruction %x\n", insn);
+		BUG();
+	} else if (ret == 0)
+		/* This instruction can't be boosted */
+		p->ainsn.boostable = -1;
+
+	return ret;
+}
+NOKPROBE_SYMBOL(try_to_emulate);
+
+int kprobe_handler(struct pt_regs *regs)
 {
 	struct kprobe *p;
 	int ret = 0;
@@ -177,10 +304,17 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 			 */
 			save_previous_kprobe(kcb);
 			set_current_kprobe(p, regs, kcb);
-			kcb->kprobe_saved_msr = regs->msr;
 			kprobes_inc_nmissed_count(p);
 			prepare_singlestep(p, regs);
 			kcb->kprobe_status = KPROBE_REENTER;
+			if (p->ainsn.boostable >= 0) {
+				ret = try_to_emulate(p, regs);
+
+				if (ret > 0) {
+					restore_previous_kprobe(kcb);
+					return 1;
+				}
+			}
 			return 1;
 		} else {
 			if (*addr != BREAKPOINT_INSTRUCTION) {
@@ -197,7 +331,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 			}
 			p = __this_cpu_read(current_kprobe);
 			if (p->break_handler && p->break_handler(p, regs)) {
-				goto ss_probe;
+				if (!skip_singlestep(p, regs, kcb))
+					goto ss_probe;
+				ret = 1;
 			}
 		}
 		goto no_kprobe;
@@ -235,18 +371,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 
 ss_probe:
 	if (p->ainsn.boostable >= 0) {
-		unsigned int insn = *p->ainsn.insn;
+		ret = try_to_emulate(p, regs);
 
-		/* regs->nip is also adjusted if emulate_step returns 1 */
-		ret = emulate_step(regs, insn);
 		if (ret > 0) {
-			/*
-			 * Once this instruction has been boosted
-			 * successfully, set the boostable flag
-			 */
-			if (unlikely(p->ainsn.boostable == 0))
-				p->ainsn.boostable = 1;
-
 			if (p->post_handler)
 				p->post_handler(p, regs, 0);
 
@@ -254,17 +381,7 @@ ss_probe:
 			reset_current_kprobe();
 			preempt_enable_no_resched();
 			return 1;
-		} else if (ret < 0) {
-			/*
-			 * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
-			 * So, we should never get here... but, its still
-			 * good to catch them, just in case...
-			 */
-			printk("Can't step on instruction %x\n", insn);
-			BUG();
-		} else if (ret == 0)
-			/* This instruction can't be boosted */
-			p->ainsn.boostable = -1;
+		}
 	}
 	prepare_singlestep(p, regs);
 	kcb->kprobe_status = KPROBE_HIT_SS;
@@ -274,6 +391,7 @@ no_kprobe:
 	preempt_enable_no_resched();
 	return ret;
 }
+NOKPROBE_SYMBOL(kprobe_handler);
 
 /*
  * Function return probe trampoline:
@@ -291,8 +409,7 @@ asm(".global kretprobe_trampoline\n"
 /*
  * Called when the probe at kretprobe trampoline is hit
  */
-static int __kprobes trampoline_probe_handler(struct kprobe *p,
-						struct pt_regs *regs)
+static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kretprobe_instance *ri = NULL;
 	struct hlist_head *head, empty_rp;
@@ -361,6 +478,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
 	 */
 	return 1;
 }
+NOKPROBE_SYMBOL(trampoline_probe_handler);
 
 /*
  * Called after single-stepping.  p->addr is the address of the
@@ -370,7 +488,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
  * single-stepped a copy of the instruction.  The address of this
  * copy is p->ainsn.insn.
  */
-int __kprobes kprobe_post_handler(struct pt_regs *regs)
+int kprobe_post_handler(struct pt_regs *regs)
 {
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -410,8 +528,9 @@ out:
 
 	return 1;
 }
+NOKPROBE_SYMBOL(kprobe_post_handler);
 
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -474,13 +593,15 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 	}
 	return 0;
 }
+NOKPROBE_SYMBOL(kprobe_fault_handler);
 
 unsigned long arch_deref_entry_point(void *entry)
 {
 	return ppc_global_function_entry(entry);
 }
+NOKPROBE_SYMBOL(arch_deref_entry_point);
 
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -497,17 +618,20 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 
 	return 1;
 }
+NOKPROBE_SYMBOL(setjmp_pre_handler);
 
-void __used __kprobes jprobe_return(void)
+void __used jprobe_return(void)
 {
 	asm volatile("trap" ::: "memory");
 }
+NOKPROBE_SYMBOL(jprobe_return);
 
-static void __used __kprobes jprobe_return_end(void)
+static void __used jprobe_return_end(void)
 {
-};
+}
+NOKPROBE_SYMBOL(jprobe_return_end);
 
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
@@ -520,6 +644,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	preempt_enable_no_resched();
 	return 1;
 }
+NOKPROBE_SYMBOL(longjmp_break_handler);
 
 static struct kprobe trampoline_p = {
 	.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
@@ -531,10 +656,11 @@ int __init arch_init_kprobes(void)
 	return register_kprobe(&trampoline_p);
 }
 
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+int arch_trampoline_kprobe(struct kprobe *p)
 {
 	if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
 		return 1;
 
 	return 0;
 }
+NOKPROBE_SYMBOL(arch_trampoline_kprobe);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index a1475e6aef3a..5f9eada3519b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -221,6 +221,8 @@ static void machine_check_process_queued_event(struct irq_work *work)
 {
 	int index;
 
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
 	/*
 	 * For now just print it to console.
 	 * TODO: log this error event to FSP or nvram.
@@ -228,12 +230,13 @@ static void machine_check_process_queued_event(struct irq_work *work)
 	while (__this_cpu_read(mce_queue_count) > 0) {
 		index = __this_cpu_read(mce_queue_count) - 1;
 		machine_check_print_event_info(
-				this_cpu_ptr(&mce_event_queue[index]));
+				this_cpu_ptr(&mce_event_queue[index]), false);
 		__this_cpu_dec(mce_queue_count);
 	}
 }
 
-void machine_check_print_event_info(struct machine_check_event *evt)
+void machine_check_print_event_info(struct machine_check_event *evt,
+				    bool user_mode)
 {
 	const char *level, *sevstr, *subtype;
 	static const char *mc_ue_types[] = {
@@ -310,7 +313,16 @@ void machine_check_print_event_info(struct machine_check_event *evt)
 
 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
 	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
-	       "Recovered" : "[Not recovered");
+	       "Recovered" : "Not recovered");
+
+	if (user_mode) {
+		printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
+			evt->srr0, current->pid, current->comm);
+	} else {
+		printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0,
+		       (void *)evt->srr0);
+	}
+
 	printk("%s  Initiator: %s\n", level,
 	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
 	switch (evt->error_type) {
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 763d6f58caa8..f913139bb0c2 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -72,10 +72,14 @@ void __flush_tlb_power8(unsigned int action)
 
 void __flush_tlb_power9(unsigned int action)
 {
+	unsigned int num_sets;
+
 	if (radix_enabled())
-		flush_tlb_206(POWER9_TLB_SETS_RADIX, action);
+		num_sets = POWER9_TLB_SETS_RADIX;
+	else
+		num_sets = POWER9_TLB_SETS_HASH;
 
-	flush_tlb_206(POWER9_TLB_SETS_HASH, action);
+	flush_tlb_206(num_sets, action);
 }
 
 
@@ -147,159 +151,365 @@ static int mce_flush(int what)
 	return 0;
 }
 
-static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb, uint64_t erat)
-{
-	if ((dsisr & slb) && mce_flush(MCE_FLUSH_SLB))
-		dsisr &= ~slb;
-	if ((dsisr & erat) && mce_flush(MCE_FLUSH_ERAT))
-		dsisr &= ~erat;
-	if ((dsisr & tlb) && mce_flush(MCE_FLUSH_TLB))
-		dsisr &= ~tlb;
-	/* Any other errors we don't understand? */
-	if (dsisr)
-		return 0;
-	return 1;
-}
-
-static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
+#define SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42))
+
+struct mce_ierror_table {
+	unsigned long srr1_mask;
+	unsigned long srr1_value;
+	bool nip_valid; /* nip is a valid indicator of faulting address */
+	unsigned int error_type;
+	unsigned int error_subtype;
+	unsigned int initiator;
+	unsigned int severity;
+};
+
+static const struct mce_ierror_table mce_p7_ierror_table[] = {
+{ 0x00000000001c0000, 0x0000000000040000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000080000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000000c0000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000100000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000140000, true,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000180000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000001c0000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+static const struct mce_ierror_table mce_p8_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+  MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000001c0000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008000000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008040000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+static const struct mce_ierror_table mce_p9_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+  MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+  MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+  MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008000000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008040000, true,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000080c0000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008100000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008140000, false,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_STORE,
+  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+{ 0x00000000081c0000, 0x0000000008180000, false,
+  MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT,
+  MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+{ 0x00000000081c0000, 0x00000000081c0000, true,
+  MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN,
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+struct mce_derror_table {
+	unsigned long dsisr_value;
+	bool dar_valid; /* dar is a valid indicator of faulting address */
+	unsigned int error_type;
+	unsigned int error_subtype;
+	unsigned int initiator;
+	unsigned int severity;
+};
+
+static const struct mce_derror_table mce_p7_derror_table[] = {
+{ 0x00008000, false,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000040, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static const struct mce_derror_table mce_p8_derror_table[] = {
+{ 0x00008000, false,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00002000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00001000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000200, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static const struct mce_derror_table mce_p9_derror_table[] = {
+{ 0x00008000, false,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+  MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00002000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00001000, true,
+  MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+  MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+  MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000200, false,
+  MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000040, true,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000020, false,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000010, false,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x00000008, false,
+  MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD_STORE_FOREIGN,
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static int mce_handle_ierror(struct pt_regs *regs,
+		const struct mce_ierror_table table[],
+		struct mce_error_info *mce_err, uint64_t *addr)
 {
-	long handled = 1;
+	uint64_t srr1 = regs->msr;
+	int handled = 0;
+	int i;
+
+	*addr = 0;
+
+	for (i = 0; table[i].srr1_mask; i++) {
+		if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
+			continue;
+
+		/* attempt to correct the error */
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_SLB:
+			handled = mce_flush(MCE_FLUSH_SLB);
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			handled = mce_flush(MCE_FLUSH_ERAT);
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			handled = mce_flush(MCE_FLUSH_TLB);
+			break;
+		}
 
-	/*
-	 * flush and reload SLBs for SLB errors and flush TLBs for TLB errors.
-	 * reset the error bits whenever we handle them so that at the end
-	 * we can check whether we handled all of them or not.
-	 * */
-#ifdef CONFIG_PPC_STD_MMU_64
-	if (dsisr & slb_error_bits) {
-		flush_and_reload_slb();
-		/* reset error bits */
-		dsisr &= ~(slb_error_bits);
-	}
-	if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
-		if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
-			cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL);
-		/* reset error bits */
-		dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
+		/* now fill in mce_error_info */
+		mce_err->error_type = table[i].error_type;
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_UE:
+			mce_err->u.ue_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_SLB:
+			mce_err->u.slb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			mce_err->u.erat_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			mce_err->u.tlb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_USER:
+			mce_err->u.user_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_RA:
+			mce_err->u.ra_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_LINK:
+			mce_err->u.link_error_type = table[i].error_subtype;
+			break;
+		}
+		mce_err->severity = table[i].severity;
+		mce_err->initiator = table[i].initiator;
+		if (table[i].nip_valid)
+			*addr = regs->nip;
+		return handled;
 	}
-#endif
-	/* Any other errors we don't understand? */
-	if (dsisr & 0xffffffffUL)
-		handled = 0;
 
-	return handled;
-}
+	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+	mce_err->severity = MCE_SEV_ERROR_SYNC;
+	mce_err->initiator = MCE_INITIATOR_CPU;
 
-static long mce_handle_derror_p7(uint64_t dsisr)
-{
-	return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
+	return 0;
 }
 
-static long mce_handle_common_ierror(uint64_t srr1)
+static int mce_handle_derror(struct pt_regs *regs,
+		const struct mce_derror_table table[],
+		struct mce_error_info *mce_err, uint64_t *addr)
 {
-	long handled = 0;
-
-	switch (P7_SRR1_MC_IFETCH(srr1)) {
-	case 0:
-		break;
-#ifdef CONFIG_PPC_STD_MMU_64
-	case P7_SRR1_MC_IFETCH_SLB_PARITY:
-	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
-		/* flush and reload SLBs for SLB errors. */
-		flush_and_reload_slb();
-		handled = 1;
-		break;
-	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
-			cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL);
-			handled = 1;
+	uint64_t dsisr = regs->dsisr;
+	int handled = 0;
+	int found = 0;
+	int i;
+
+	*addr = 0;
+
+	for (i = 0; table[i].dsisr_value; i++) {
+		if (!(dsisr & table[i].dsisr_value))
+			continue;
+
+		/* attempt to correct the error */
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_SLB:
+			if (mce_flush(MCE_FLUSH_SLB))
+				handled = 1;
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			if (mce_flush(MCE_FLUSH_ERAT))
+				handled = 1;
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			if (mce_flush(MCE_FLUSH_TLB))
+				handled = 1;
+			break;
 		}
-		break;
-#endif
-	default:
-		break;
-	}
 
-	return handled;
-}
-
-static long mce_handle_ierror_p7(uint64_t srr1)
-{
-	long handled = 0;
-
-	handled = mce_handle_common_ierror(srr1);
+		/*
+		 * Attempt to handle multiple conditions, but only return
+		 * one. Ensure uncorrectable errors are first in the table
+		 * to match.
+		 */
+		if (found)
+			continue;
+
+		/* now fill in mce_error_info */
+		mce_err->error_type = table[i].error_type;
+		switch (table[i].error_type) {
+		case MCE_ERROR_TYPE_UE:
+			mce_err->u.ue_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_SLB:
+			mce_err->u.slb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_ERAT:
+			mce_err->u.erat_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_TLB:
+			mce_err->u.tlb_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_USER:
+			mce_err->u.user_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_RA:
+			mce_err->u.ra_error_type = table[i].error_subtype;
+			break;
+		case MCE_ERROR_TYPE_LINK:
+			mce_err->u.link_error_type = table[i].error_subtype;
+			break;
+		}
+		mce_err->severity = table[i].severity;
+		mce_err->initiator = table[i].initiator;
+		if (table[i].dar_valid)
+			*addr = regs->dar;
 
-#ifdef CONFIG_PPC_STD_MMU_64
-	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
-		flush_and_reload_slb();
-		handled = 1;
+		found = 1;
 	}
-#endif
-	return handled;
-}
 
-static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
-{
-	switch (P7_SRR1_MC_IFETCH(srr1)) {
-	case P7_SRR1_MC_IFETCH_SLB_PARITY:
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-		break;
-	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-		break;
-	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_TLB;
-		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-		break;
-	case P7_SRR1_MC_IFETCH_UE:
-	case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
-		break;
-	case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type =
-				MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
-		break;
-	}
-}
+	if (found)
+		return handled;
 
-static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1)
-{
-	mce_get_common_ierror(mce_err, srr1);
-	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
-	}
-}
+	mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+	mce_err->severity = MCE_SEV_ERROR_SYNC;
+	mce_err->initiator = MCE_INITIATOR_CPU;
 
-static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
-{
-	if (dsisr & P7_DSISR_MC_UE) {
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
-	} else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type =
-				MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
-	} else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-	} else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-	} else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_TLB;
-		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
-	}
+	return 0;
 }
 
 static long mce_handle_ue_error(struct pt_regs *regs)
@@ -320,292 +530,42 @@ static long mce_handle_ue_error(struct pt_regs *regs)
 	return handled;
 }
 
-long __machine_check_early_realmode_p7(struct pt_regs *regs)
+static long mce_handle_error(struct pt_regs *regs,
+		const struct mce_derror_table dtable[],
+		const struct mce_ierror_table itable[])
 {
-	uint64_t srr1, nip, addr;
-	long handled = 1;
-	struct mce_error_info mce_error_info = { 0 };
-
-	mce_error_info.severity = MCE_SEV_ERROR_SYNC;
-	mce_error_info.initiator = MCE_INITIATOR_CPU;
-
-	srr1 = regs->msr;
-	nip = regs->nip;
+	struct mce_error_info mce_err = { 0 };
+	uint64_t addr;
+	uint64_t srr1 = regs->msr;
+	long handled;
 
-	/*
-	 * Handle memory errors depending whether this was a load/store or
-	 * ifetch exception. Also, populate the mce error_type and
-	 * type-specific error_type from either SRR1 or DSISR, depending
-	 * whether this was a load/store or ifetch exception
-	 */
-	if (P7_SRR1_MC_LOADSTORE(srr1)) {
-		handled = mce_handle_derror_p7(regs->dsisr);
-		mce_get_derror_p7(&mce_error_info, regs->dsisr);
-		addr = regs->dar;
-	} else {
-		handled = mce_handle_ierror_p7(srr1);
-		mce_get_ierror_p7(&mce_error_info, srr1);
-		addr = regs->nip;
-	}
+	if (SRR1_MC_LOADSTORE(srr1))
+		handled = mce_handle_derror(regs, dtable, &mce_err, &addr);
+	else
+		handled = mce_handle_ierror(regs, itable, &mce_err, &addr);
 
-	/* Handle UE error. */
-	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
+	if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
 		handled = mce_handle_ue_error(regs);
 
-	save_mce_event(regs, handled, &mce_error_info, nip, addr);
-	return handled;
-}
-
-static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1)
-{
-	mce_get_common_ierror(mce_err, srr1);
-	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-	}
-}
-
-static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
-{
-	mce_get_derror_p7(mce_err, dsisr);
-	if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) {
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-	}
-}
-
-static long mce_handle_ierror_p8(uint64_t srr1)
-{
-	long handled = 0;
+	save_mce_event(regs, handled, &mce_err, regs->nip, addr);
 
-	handled = mce_handle_common_ierror(srr1);
-
-#ifdef CONFIG_PPC_STD_MMU_64
-	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
-		flush_and_reload_slb();
-		handled = 1;
-	}
-#endif
 	return handled;
 }
 
-static long mce_handle_derror_p8(uint64_t dsisr)
-{
-	return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS);
-}
-
-long __machine_check_early_realmode_p8(struct pt_regs *regs)
-{
-	uint64_t srr1, nip, addr;
-	long handled = 1;
-	struct mce_error_info mce_error_info = { 0 };
-
-	mce_error_info.severity = MCE_SEV_ERROR_SYNC;
-	mce_error_info.initiator = MCE_INITIATOR_CPU;
-
-	srr1 = regs->msr;
-	nip = regs->nip;
-
-	if (P7_SRR1_MC_LOADSTORE(srr1)) {
-		handled = mce_handle_derror_p8(regs->dsisr);
-		mce_get_derror_p8(&mce_error_info, regs->dsisr);
-		addr = regs->dar;
-	} else {
-		handled = mce_handle_ierror_p8(srr1);
-		mce_get_ierror_p8(&mce_error_info, srr1);
-		addr = regs->nip;
-	}
-
-	/* Handle UE error. */
-	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
-		handled = mce_handle_ue_error(regs);
-
-	save_mce_event(regs, handled, &mce_error_info, nip, addr);
-	return handled;
-}
-
-static int mce_handle_derror_p9(struct pt_regs *regs)
-{
-	uint64_t dsisr = regs->dsisr;
-
-	return mce_handle_flush_derrors(dsisr,
-			P9_DSISR_MC_SLB_PARITY_MFSLB |
-			P9_DSISR_MC_SLB_MULTIHIT_MFSLB,
-
-			P9_DSISR_MC_TLB_MULTIHIT_MFTLB,
-
-			P9_DSISR_MC_ERAT_MULTIHIT);
-}
-
-static int mce_handle_ierror_p9(struct pt_regs *regs)
+long __machine_check_early_realmode_p7(struct pt_regs *regs)
 {
-	uint64_t srr1 = regs->msr;
+	/* P7 DD1 leaves top bits of DSISR undefined */
+	regs->dsisr &= 0x0000ffff;
 
-	switch (P9_SRR1_MC_IFETCH(srr1)) {
-	case P9_SRR1_MC_IFETCH_SLB_PARITY:
-	case P9_SRR1_MC_IFETCH_SLB_MULTIHIT:
-		return mce_flush(MCE_FLUSH_SLB);
-	case P9_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		return mce_flush(MCE_FLUSH_TLB);
-	case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT:
-		return mce_flush(MCE_FLUSH_ERAT);
-	default:
-		return 0;
-	}
+	return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table);
 }
 
-static void mce_get_derror_p9(struct pt_regs *regs,
-		struct mce_error_info *mce_err, uint64_t *addr)
-{
-	uint64_t dsisr = regs->dsisr;
-
-	mce_err->severity = MCE_SEV_ERROR_SYNC;
-	mce_err->initiator = MCE_INITIATOR_CPU;
-
-	if (dsisr & P9_DSISR_MC_USER_TLBIE)
-		*addr = regs->nip;
-	else
-		*addr = regs->dar;
-
-	if (dsisr & P9_DSISR_MC_UE) {
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
-	} else if (dsisr & P9_DSISR_MC_UE_TABLEWALK) {
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
-	} else if (dsisr & P9_DSISR_MC_LINK_LOAD_TIMEOUT) {
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_LOAD_TIMEOUT;
-	} else if (dsisr & P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT) {
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT;
-	} else if (dsisr & P9_DSISR_MC_ERAT_MULTIHIT) {
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-	} else if (dsisr & P9_DSISR_MC_TLB_MULTIHIT_MFTLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_TLB;
-		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-	} else if (dsisr & P9_DSISR_MC_USER_TLBIE) {
-		mce_err->error_type = MCE_ERROR_TYPE_USER;
-		mce_err->u.user_error_type = MCE_USER_ERROR_TLBIE;
-	} else if (dsisr & P9_DSISR_MC_SLB_PARITY_MFSLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-	} else if (dsisr & P9_DSISR_MC_SLB_MULTIHIT_MFSLB) {
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-	} else if (dsisr & P9_DSISR_MC_RA_LOAD) {
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD;
-	} else if (dsisr & P9_DSISR_MC_RA_TABLEWALK) {
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
-	} else if (dsisr & P9_DSISR_MC_RA_TABLEWALK_FOREIGN) {
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN;
-	} else if (dsisr & P9_DSISR_MC_RA_FOREIGN) {
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD_STORE_FOREIGN;
-	}
-}
-
-static void mce_get_ierror_p9(struct pt_regs *regs,
-		struct mce_error_info *mce_err, uint64_t *addr)
+long __machine_check_early_realmode_p8(struct pt_regs *regs)
 {
-	uint64_t srr1 = regs->msr;
-
-	switch (P9_SRR1_MC_IFETCH(srr1)) {
-	case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE:
-	case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT:
-		mce_err->severity = MCE_SEV_FATAL;
-		break;
-	default:
-		mce_err->severity = MCE_SEV_ERROR_SYNC;
-		break;
-	}
-
-	mce_err->initiator = MCE_INITIATOR_CPU;
-
-	*addr = regs->nip;
-
-	switch (P9_SRR1_MC_IFETCH(srr1)) {
-	case P9_SRR1_MC_IFETCH_UE:
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
-		break;
-	case P9_SRR1_MC_IFETCH_SLB_PARITY:
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-		break;
-	case P9_SRR1_MC_IFETCH_SLB_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_SLB;
-		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-		break;
-	case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-		break;
-	case P9_SRR1_MC_IFETCH_TLB_MULTIHIT:
-		mce_err->error_type = MCE_ERROR_TYPE_TLB;
-		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-		break;
-	case P9_SRR1_MC_IFETCH_UE_TLB_RELOAD:
-		mce_err->error_type = MCE_ERROR_TYPE_UE;
-		mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
-		break;
-	case P9_SRR1_MC_IFETCH_LINK_TIMEOUT:
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_IFETCH_TIMEOUT;
-		break;
-	case P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT:
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT;
-		break;
-	case P9_SRR1_MC_IFETCH_RA:
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_IFETCH;
-		break;
-	case P9_SRR1_MC_IFETCH_RA_TABLEWALK:
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH;
-		break;
-	case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE:
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_STORE;
-		break;
-	case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT:
-		mce_err->error_type = MCE_ERROR_TYPE_LINK;
-		mce_err->u.link_error_type = MCE_LINK_ERROR_STORE_TIMEOUT;
-		break;
-	case P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN:
-		mce_err->error_type = MCE_ERROR_TYPE_RA;
-		mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN;
-		break;
-	default:
-		break;
-	}
+	return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table);
 }
 
 long __machine_check_early_realmode_p9(struct pt_regs *regs)
 {
-	uint64_t nip, addr;
-	long handled;
-	struct mce_error_info mce_error_info = { 0 };
-
-	nip = regs->nip;
-
-	if (P9_SRR1_MC_LOADSTORE(regs->msr)) {
-		handled = mce_handle_derror_p9(regs);
-		mce_get_derror_p9(regs, &mce_error_info, &addr);
-	} else {
-		handled = mce_handle_ierror_p9(regs);
-		mce_get_ierror_p9(regs, &mce_error_info, &addr);
-	}
-
-	/* Handle UE error. */
-	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
-		handled = mce_handle_ue_error(regs);
-
-	save_mce_event(regs, handled, &mce_error_info, nip, addr);
-	return handled;
+	return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table);
 }
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 2282bf4e63cd..ec60ed0d4aad 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -243,10 +243,10 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
 	/*
 	 * 2. branch to optimized_callback() and emulate_step()
 	 */
-	kprobe_lookup_name("optimized_callback", op_callback_addr);
-	kprobe_lookup_name("emulate_step", emulate_step_addr);
+	op_callback_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("optimized_callback");
+	emulate_step_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("emulate_step");
 	if (!op_callback_addr || !emulate_step_addr) {
-		WARN(1, "kprobe_lookup_name() failed\n");
+		WARN(1, "Unable to lookup optimized_callback()/emulate_step()\n");
 		goto error;
 	}
 
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index dfc479df9634..8d63627e067f 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -245,3 +245,24 @@ void __init free_unused_pacas(void)
 
 	free_lppacas();
 }
+
+void copy_mm_to_paca(struct mm_struct *mm)
+{
+#ifdef CONFIG_PPC_BOOK3S
+	mm_context_t *context = &mm->context;
+
+	get_paca()->mm_ctx_id = context->id;
+#ifdef CONFIG_PPC_MM_SLICES
+	VM_BUG_ON(!mm->context.addr_limit);
+	get_paca()->addr_limit = mm->context.addr_limit;
+	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
+	memcpy(&get_paca()->mm_ctx_high_slices_psize,
+	       &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
+#else /* CONFIG_PPC_MM_SLICES */
+	get_paca()->mm_ctx_user_psize = context->user_psize;
+	get_paca()->mm_ctx_sllp = context->sllp;
+#endif
+#else /* CONFIG_PPC_BOOK3S */
+	return;
+#endif
+}
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f5d399e46193..d2f0afeae5a0 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -55,7 +55,6 @@
 #include <asm/kexec.h>
 #include <asm/opal.h>
 #include <asm/fadump.h>
-#include <asm/debug.h>
 #include <asm/epapr_hcalls.h>
 #include <asm/firmware.h>
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1c1b44ec7642..dd8a04f3053a 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -815,7 +815,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
 		.virt_base = cpu_to_be32(0xffffffff),
 		.virt_size = cpu_to_be32(0xffffffff),
 		.load_base = cpu_to_be32(0xffffffff),
-		.min_rma = cpu_to_be32(256),		/* 256MB min RMA */
+		.min_rma = cpu_to_be32(512),		/* 512MB min RMA */
 		.min_load = cpu_to_be32(0xffffffff),	/* full client load */
 		.min_rma_percent = 0,	/* min RMA percentage of total RAM */
 		.max_pft_size = 48,	/* max log_2(hash table size) */
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 4697da895133..5c10b5925ac2 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -31,11 +31,11 @@
 #include <linux/unistd.h>
 #include <linux/serial.h>
 #include <linux/serial_8250.h>
-#include <linux/debugfs.h>
 #include <linux/percpu.h>
 #include <linux/memblock.h>
 #include <linux/of_platform.h>
 #include <linux/hugetlb.h>
+#include <asm/debugfs.h>
 #include <asm/io.h>
 #include <asm/paca.h>
 #include <asm/prom.h>
@@ -920,6 +920,15 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
 	init_mm.brk = klimit;
+
+#ifdef CONFIG_PPC_MM_SLICES
+#ifdef CONFIG_PPC64
+	init_mm.context.addr_limit = TASK_SIZE_128TB;
+#else
+#error	"context.addr_limit not initialized."
+#endif
+#endif
+
 #ifdef CONFIG_PPC_64K_PAGES
 	init_mm.context.pte_frag = NULL;
 #endif
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index f997154dfc41..0d4dcaeaafcb 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -230,8 +230,8 @@ static void cpu_ready_for_interrupts(void)
 	 * If we are not in hypervisor mode the job is done once for
 	 * the whole partition in configure_exceptions().
 	 */
-	if (early_cpu_has_feature(CPU_FTR_HVMODE) &&
-	    early_cpu_has_feature(CPU_FTR_ARCH_207S)) {
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_207S)) {
 		unsigned long lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
 	}
@@ -637,6 +637,11 @@ void __init emergency_stack_init(void)
 		paca[i].emergency_sp = (void *)ti + THREAD_SIZE;
 
 #ifdef CONFIG_PPC_BOOK3S_64
+		/* emergency stack for NMI exception handling. */
+		ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
+		klp_init_thread_info(ti);
+		paca[i].nmi_emergency_sp = (void *)ti + THREAD_SIZE;
+
 		/* emergency stack for machine check exception handling. */
 		ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
 		klp_init_thread_info(ti);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index d68ed1f004a3..df2a41647d8e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -39,6 +39,7 @@
 #include <asm/irq.h>
 #include <asm/hw_irq.h>
 #include <asm/kvm_ppc.h>
+#include <asm/dbell.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/prom.h>
@@ -86,8 +87,6 @@ volatile unsigned int cpu_callin_map[NR_CPUS];
 
 int smt_enabled_at_boot = 1;
 
-static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL;
-
 /*
  * Returns 1 if the specified cpu should be brought up during boot.
  * Used to inhibit booting threads if they've been disabled or
@@ -158,32 +157,33 @@ static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t debug_ipi_action(int irq, void *data)
+#ifdef CONFIG_NMI_IPI
+static irqreturn_t nmi_ipi_action(int irq, void *data)
 {
-	if (crash_ipi_function_ptr) {
-		crash_ipi_function_ptr(get_irq_regs());
-		return IRQ_HANDLED;
-	}
-
-#ifdef CONFIG_DEBUGGER
-	debugger_ipi(get_irq_regs());
-#endif /* CONFIG_DEBUGGER */
-
+	smp_handle_nmi_ipi(get_irq_regs());
 	return IRQ_HANDLED;
 }
+#endif
 
 static irq_handler_t smp_ipi_action[] = {
 	[PPC_MSG_CALL_FUNCTION] =  call_function_action,
 	[PPC_MSG_RESCHEDULE] = reschedule_action,
 	[PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
-	[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
+#ifdef CONFIG_NMI_IPI
+	[PPC_MSG_NMI_IPI] = nmi_ipi_action,
+#endif
 };
 
+/*
+ * The NMI IPI is a fallback and not truly non-maskable. It is simpler
+ * than going through the call function infrastructure, and strongly
+ * serialized, so it is more appropriate for debugging.
+ */
 const char *smp_ipi_name[] = {
 	[PPC_MSG_CALL_FUNCTION] =  "ipi call function",
 	[PPC_MSG_RESCHEDULE] = "ipi reschedule",
 	[PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
-	[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
+	[PPC_MSG_NMI_IPI] = "nmi ipi",
 };
 
 /* optional function to request ipi, for controllers with >= 4 ipis */
@@ -191,14 +191,13 @@ int smp_request_message_ipi(int virq, int msg)
 {
 	int err;
 
-	if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) {
+	if (msg < 0 || msg > PPC_MSG_NMI_IPI)
 		return -EINVAL;
-	}
-#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC_CORE)
-	if (msg == PPC_MSG_DEBUGGER_BREAK) {
+#ifndef CONFIG_NMI_IPI
+	if (msg == PPC_MSG_NMI_IPI)
 		return 1;
-	}
 #endif
+
 	err = request_irq(virq, smp_ipi_action[msg],
 			  IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND,
 			  smp_ipi_name[msg], NULL);
@@ -211,17 +210,9 @@ int smp_request_message_ipi(int virq, int msg)
 #ifdef CONFIG_PPC_SMP_MUXED_IPI
 struct cpu_messages {
 	long messages;			/* current messages */
-	unsigned long data;		/* data for cause ipi */
 };
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
 
-void smp_muxed_ipi_set_data(int cpu, unsigned long data)
-{
-	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
-
-	info->data = data;
-}
-
 void smp_muxed_ipi_set_message(int cpu, int msg)
 {
 	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
@@ -236,14 +227,13 @@ void smp_muxed_ipi_set_message(int cpu, int msg)
 
 void smp_muxed_ipi_message_pass(int cpu, int msg)
 {
-	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
-
 	smp_muxed_ipi_set_message(cpu, msg);
+
 	/*
 	 * cause_ipi functions are required to include a full barrier
 	 * before doing whatever causes the IPI.
 	 */
-	smp_ops->cause_ipi(cpu, info->data);
+	smp_ops->cause_ipi(cpu);
 }
 
 #ifdef __BIG_ENDIAN__
@@ -254,11 +244,18 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
 
 irqreturn_t smp_ipi_demux(void)
 {
-	struct cpu_messages *info = this_cpu_ptr(&ipi_message);
-	unsigned long all;
-
 	mb();	/* order any irq clear */
 
+	return smp_ipi_demux_relaxed();
+}
+
+/* sync-free variant. Callers should ensure synchronization */
+irqreturn_t smp_ipi_demux_relaxed(void)
+{
+	struct cpu_messages *info;
+	unsigned long all;
+
+	info = this_cpu_ptr(&ipi_message);
 	do {
 		all = xchg(&info->messages, 0);
 #if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
@@ -278,8 +275,10 @@ irqreturn_t smp_ipi_demux(void)
 			scheduler_ipi();
 		if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
 			tick_broadcast_ipi_handler();
-		if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
-			debug_ipi_action(0, NULL);
+#ifdef CONFIG_NMI_IPI
+		if (all & IPI_MESSAGE(PPC_MSG_NMI_IPI))
+			nmi_ipi_action(0, NULL);
+#endif
 	} while (info->messages);
 
 	return IRQ_HANDLED;
@@ -316,6 +315,187 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 		do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
 }
 
+#ifdef CONFIG_NMI_IPI
+
+/*
+ * "NMI IPI" system.
+ *
+ * NMI IPIs may not be recoverable, so should not be used as ongoing part of
+ * a running system. They can be used for crash, debug, halt/reboot, etc.
+ *
+ * NMI IPIs are globally single threaded. No more than one in progress at
+ * any time.
+ *
+ * The IPI call waits with interrupts disabled until all targets enter the
+ * NMI handler, then the call returns.
+ *
+ * No new NMI can be initiated until targets exit the handler.
+ *
+ * The IPI call may time out without all targets entering the NMI handler.
+ * In that case, there is some logic to recover (and ignore subsequent
+ * NMI interrupts that may eventually be raised), but the platform interrupt
+ * handler may not be able to distinguish this from other exception causes,
+ * which may cause a crash.
+ */
+
+static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0);
+static struct cpumask nmi_ipi_pending_mask;
+static int nmi_ipi_busy_count = 0;
+static void (*nmi_ipi_function)(struct pt_regs *) = NULL;
+
+static void nmi_ipi_lock_start(unsigned long *flags)
+{
+	raw_local_irq_save(*flags);
+	hard_irq_disable();
+	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
+		raw_local_irq_restore(*flags);
+		cpu_relax();
+		raw_local_irq_save(*flags);
+		hard_irq_disable();
+	}
+}
+
+static void nmi_ipi_lock(void)
+{
+	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
+		cpu_relax();
+}
+
+static void nmi_ipi_unlock(void)
+{
+	smp_mb();
+	WARN_ON(atomic_read(&__nmi_ipi_lock) != 1);
+	atomic_set(&__nmi_ipi_lock, 0);
+}
+
+static void nmi_ipi_unlock_end(unsigned long *flags)
+{
+	nmi_ipi_unlock();
+	raw_local_irq_restore(*flags);
+}
+
+/*
+ * Platform NMI handler calls this to ack
+ */
+int smp_handle_nmi_ipi(struct pt_regs *regs)
+{
+	void (*fn)(struct pt_regs *);
+	unsigned long flags;
+	int me = raw_smp_processor_id();
+	int ret = 0;
+
+	/*
+	 * Unexpected NMIs are possible here because the interrupt may not
+	 * be able to distinguish NMI IPIs from other types of NMIs, or
+	 * because the caller may have timed out.
+	 */
+	nmi_ipi_lock_start(&flags);
+	if (!nmi_ipi_busy_count)
+		goto out;
+	if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask))
+		goto out;
+
+	fn = nmi_ipi_function;
+	if (!fn)
+		goto out;
+
+	cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
+	nmi_ipi_busy_count++;
+	nmi_ipi_unlock();
+
+	ret = 1;
+
+	fn(regs);
+
+	nmi_ipi_lock();
+	nmi_ipi_busy_count--;
+out:
+	nmi_ipi_unlock_end(&flags);
+
+	return ret;
+}
+
+static void do_smp_send_nmi_ipi(int cpu)
+{
+	if (smp_ops->cause_nmi_ipi && smp_ops->cause_nmi_ipi(cpu))
+		return;
+
+	if (cpu >= 0) {
+		do_message_pass(cpu, PPC_MSG_NMI_IPI);
+	} else {
+		int c;
+
+		for_each_online_cpu(c) {
+			if (c == raw_smp_processor_id())
+				continue;
+			do_message_pass(c, PPC_MSG_NMI_IPI);
+		}
+	}
+}
+
+/*
+ * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
+ * - fn is the target callback function.
+ * - delay_us > 0 is the delay before giving up waiting for targets to
+ *   enter the handler, == 0 specifies indefinite delay.
+ */
+static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
+{
+	unsigned long flags;
+	int me = raw_smp_processor_id();
+	int ret = 1;
+
+	BUG_ON(cpu == me);
+	BUG_ON(cpu < 0 && cpu != NMI_IPI_ALL_OTHERS);
+
+	if (unlikely(!smp_ops))
+		return 0;
+
+	/* Take the nmi_ipi_busy count/lock with interrupts hard disabled */
+	nmi_ipi_lock_start(&flags);
+	while (nmi_ipi_busy_count) {
+		nmi_ipi_unlock_end(&flags);
+		cpu_relax();
+		nmi_ipi_lock_start(&flags);
+	}
+
+	nmi_ipi_function = fn;
+
+	if (cpu < 0) {
+		/* ALL_OTHERS */
+		cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask);
+		cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
+	} else {
+		/* cpumask starts clear */
+		cpumask_set_cpu(cpu, &nmi_ipi_pending_mask);
+	}
+	nmi_ipi_busy_count++;
+	nmi_ipi_unlock();
+
+	do_smp_send_nmi_ipi(cpu);
+
+	while (!cpumask_empty(&nmi_ipi_pending_mask)) {
+		udelay(1);
+		if (delay_us) {
+			delay_us--;
+			if (!delay_us)
+				break;
+		}
+	}
+
+	nmi_ipi_lock();
+	if (!cpumask_empty(&nmi_ipi_pending_mask)) {
+		/* Could not gather all CPUs */
+		ret = 0;
+		cpumask_clear(&nmi_ipi_pending_mask);
+	}
+	nmi_ipi_busy_count--;
+	nmi_ipi_unlock_end(&flags);
+
+	return ret;
+}
+#endif /* CONFIG_NMI_IPI */
+
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 void tick_broadcast(const struct cpumask *mask)
 {
@@ -326,29 +506,22 @@ void tick_broadcast(const struct cpumask *mask)
 }
 #endif
 
-#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
-void smp_send_debugger_break(void)
+#ifdef CONFIG_DEBUGGER
+void debugger_ipi_callback(struct pt_regs *regs)
 {
-	int cpu;
-	int me = raw_smp_processor_id();
-
-	if (unlikely(!smp_ops))
-		return;
+	debugger_ipi(regs);
+}
 
-	for_each_online_cpu(cpu)
-		if (cpu != me)
-			do_message_pass(cpu, PPC_MSG_DEBUGGER_BREAK);
+void smp_send_debugger_break(void)
+{
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000);
 }
 #endif
 
 #ifdef CONFIG_KEXEC_CORE
 void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
 {
-	crash_ipi_function_ptr = crash_ipi_callback;
-	if (crash_ipi_callback) {
-		mb();
-		smp_send_debugger_break();
-	}
+	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
 }
 #endif
 
@@ -439,7 +612,21 @@ int generic_cpu_disable(void)
 #ifdef CONFIG_PPC64
 	vdso_data->processorCount--;
 #endif
-	migrate_irqs();
+	/* Update affinity of all IRQs previously aimed at this CPU */
+	irq_migrate_all_off_this_cpu();
+
+	/*
+	 * Depending on the details of the interrupt controller, it's possible
+	 * that one of the interrupts we just migrated away from this CPU is
+	 * actually already pending on this CPU. If we leave it in that state
+	 * the interrupt will never be EOI'ed, and will never fire again. So
+	 * temporarily enable interrupts here, to allow any pending interrupt to
+	 * be received (and EOI'ed), before we take this CPU offline.
+	 */
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+
 	return 0;
 }
 
@@ -521,6 +708,16 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 
 	cpu_idle_thread_init(cpu, tidle);
 
+	/*
+	 * The platform might need to allocate resources prior to bringing
+	 * up the CPU
+	 */
+	if (smp_ops->prepare_cpu) {
+		rc = smp_ops->prepare_cpu(cpu);
+		if (rc)
+			return rc;
+	}
+
 	/* Make sure callin-map entry is 0 (can be leftover a CPU
 	 * hotplug
 	 */
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 66711958493c..d534ed901538 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -59,7 +59,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
-	save_context_stack(trace, tsk->thread.ksp, tsk, 0);
+	unsigned long sp;
+
+	if (tsk == current)
+		sp = current_stack_pointer();
+	else
+		sp = tsk->thread.ksp;
+
+	save_context_stack(trace, sp, tsk, 0);
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c
index 6ae9bd5086a4..0050b2d2ff7a 100644
--- a/arch/powerpc/kernel/swsusp.c
+++ b/arch/powerpc/kernel/swsusp.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/suspend.h>
 #include <asm/current.h>
 #include <asm/mmu_context.h>
 #include <asm/switch_to.h>
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index de04c9fbb5cd..a877bf8269fe 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -42,11 +42,11 @@
 #include <asm/unistd.h>
 #include <asm/asm-prototypes.h>
 
-static inline unsigned long do_mmap2(unsigned long addr, size_t len,
+static inline long do_mmap2(unsigned long addr, size_t len,
 			unsigned long prot, unsigned long flags,
 			unsigned long fd, unsigned long off, int shift)
 {
-	unsigned long ret = -EINVAL;
+	long ret = -EINVAL;
 
 	if (!arch_validate_prot(prot))
 		goto out;
@@ -62,16 +62,16 @@ out:
 	return ret;
 }
 
-unsigned long sys_mmap2(unsigned long addr, size_t len,
-			unsigned long prot, unsigned long flags,
-			unsigned long fd, unsigned long pgoff)
+SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, unsigned long, pgoff)
 {
 	return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12);
 }
 
-unsigned long sys_mmap(unsigned long addr, size_t len,
-		       unsigned long prot, unsigned long flags,
-		       unsigned long fd, off_t offset)
+SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, off_t, offset)
 {
 	return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT);
 }
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index c1fb255a60d6..4437c70c7c2b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -710,6 +710,10 @@ static int register_cpu_online(unsigned int cpu)
 	struct device_attribute *attrs, *pmc_attrs;
 	int i, nattrs;
 
+	/* For cpus present at boot a reference was already grabbed in register_cpu() */
+	if (!s->of_node)
+		s->of_node = of_get_cpu_node(cpu, NULL);
+
 #ifdef CONFIG_PPC64
 	if (cpu_has_feature(CPU_FTR_SMT))
 		device_create_file(s, &dev_attr_smt_snooze_delay);
@@ -785,9 +789,9 @@ static int register_cpu_online(unsigned int cpu)
 	return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
 static int unregister_cpu_online(unsigned int cpu)
 {
-#ifdef CONFIG_HOTPLUG_CPU
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
 	struct device *s = &c->dev;
 	struct device_attribute *attrs, *pmc_attrs;
@@ -864,9 +868,13 @@ static int unregister_cpu_online(unsigned int cpu)
 	}
 #endif
 	cacheinfo_cpu_offline(cpu);
-#endif /* CONFIG_HOTPLUG_CPU */
+	of_node_put(s->of_node);
+	s->of_node = NULL;
 	return 0;
 }
+#else /* !CONFIG_HOTPLUG_CPU */
+#define unregister_cpu_online NULL
+#endif
 
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
 ssize_t arch_cpu_probe(const char *buf, size_t count)
diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile
new file mode 100644
index 000000000000..729dffc5f7bc
--- /dev/null
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -0,0 +1,29 @@
+#
+# Makefile for the powerpc trace subsystem
+#
+
+subdir-ccflags-$(CONFIG_PPC_WERROR)	:= -Werror
+
+ifdef CONFIG_FUNCTION_TRACER
+# do not trace tracer code
+CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
+endif
+
+obj32-$(CONFIG_FUNCTION_TRACER)		+= ftrace_32.o
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64.o
+ifdef CONFIG_MPROFILE_KERNEL
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64_mprofile.o
+else
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64_pg.o
+endif
+obj-$(CONFIG_DYNAMIC_FTRACE)		+= ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS)		+= ftrace.o
+obj-$(CONFIG_TRACING)			+= trace_clock.o
+
+obj-$(CONFIG_PPC64)			+= $(obj64-y)
+obj-$(CONFIG_PPC32)			+= $(obj32-y)
+
+# Disable GCOV & sanitizers in odd or sensitive code
+GCOV_PROFILE_ftrace.o := n
+UBSAN_SANITIZE_ftrace.o := n
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 5c9f50c1aa99..32509de6ce4c 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/list.h>
 
+#include <asm/asm-prototypes.h>
 #include <asm/cacheflush.h>
 #include <asm/code-patching.h>
 #include <asm/ftrace.h>
diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S
new file mode 100644
index 000000000000..afef2c076282
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_32.S
@@ -0,0 +1,118 @@
+/*
+ * Split from entry_32.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+	/*
+	 * It is required that _mcount on PPC32 must preserve the
+	 * link register. But we have r0 to play with. We use r0
+	 * to push the return address back to the caller of mcount
+	 * into the ctr register, restore the link register and
+	 * then jump back using the ctr register.
+	 */
+	mflr	r0
+	mtctr	r0
+	lwz	r0, 4(r1)
+	mtlr	r0
+	bctr
+
+_GLOBAL(ftrace_caller)
+	MCOUNT_SAVE_FRAME
+	/* r3 ends up with link register */
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+	MCOUNT_RESTORE_FRAME
+	/* old link register ends up in ctr reg */
+	bctr
+#else
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+
+	MCOUNT_SAVE_FRAME
+
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	LOAD_REG_ADDR(r5, ftrace_trace_function)
+	lwz	r5,0(r5)
+
+	mtctr	r5
+	bctrl
+	nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	b	ftrace_graph_caller
+#endif
+	MCOUNT_RESTORE_FRAME
+	bctr
+#endif
+EXPORT_SYMBOL(_mcount)
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	/* load r4 with local address */
+	lwz	r4, 44(r1)
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	/* Grab the LR out of the caller stack frame */
+	lwz	r3,52(r1)
+
+	bl	prepare_ftrace_return
+	nop
+
+        /*
+         * prepare_ftrace_return gives us the address we divert to.
+         * Change the LR in the callers stack frame to this.
+         */
+	stw	r3,52(r1)
+
+	MCOUNT_RESTORE_FRAME
+	/* old link register ends up in ctr reg */
+	bctr
+
+_GLOBAL(return_to_handler)
+	/* need to save return values */
+	stwu	r1, -32(r1)
+	stw	r3, 20(r1)
+	stw	r4, 16(r1)
+	stw	r31, 12(r1)
+	mr	r31, r1
+
+	bl	ftrace_return_to_handler
+	nop
+
+	/* return value has real return address */
+	mtlr	r3
+
+	lwz	r3, 20(r1)
+	lwz	r4, 16(r1)
+	lwz	r31,12(r1)
+	lwz	r1, 0(r1)
+
+	/* Jump back to real return address */
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S
new file mode 100644
index 000000000000..e5ccea19821e
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64.S
@@ -0,0 +1,85 @@
+/*
+ * Split from entry_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+EXPORT_SYMBOL(_mcount)
+	mflr	r12
+	mtctr	r12
+	mtlr	r0
+	bctr
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+_GLOBAL_TOC(_mcount)
+EXPORT_SYMBOL(_mcount)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	LOAD_REG_ADDR(r5,ftrace_trace_function)
+	ld	r5,0(r5)
+	ld	r5,0(r5)
+	mtctr	r5
+	bctrl
+	nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	b	ftrace_graph_caller
+#endif
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+_GLOBAL(ftrace_stub)
+	blr
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(return_to_handler)
+	/* need to save return values */
+	std	r4,  -32(r1)
+	std	r3,  -24(r1)
+	/* save TOC */
+	std	r2,  -16(r1)
+	std	r31, -8(r1)
+	mr	r31, r1
+	stdu	r1, -112(r1)
+
+	/*
+	 * We might be called from a module.
+	 * Switch to our TOC to run inside the core kernel.
+	 */
+	ld	r2, PACATOC(r13)
+
+	bl	ftrace_return_to_handler
+	nop
+
+	/* return value has real return address */
+	mtlr	r3
+
+	ld	r1, 0(r1)
+	ld	r4,  -32(r1)
+	ld	r3,  -24(r1)
+	ld	r2,  -16(r1)
+	ld	r31, -8(r1)
+
+	/* Jump back to real return address */
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
new file mode 100644
index 000000000000..7c933a99f5d5
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -0,0 +1,272 @@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+#include <asm/thread_info.h>
+#include <asm/bug.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ *
+ * ftrace_caller() is the function that replaces _mcount() when ftrace is
+ * active.
+ *
+ * We arrive here after a function A calls function B, and we are the trace
+ * function for B. When we enter r1 points to A's stack frame, B has not yet
+ * had a chance to allocate one yet.
+ *
+ * Additionally r2 may point either to the TOC for A, or B, depending on
+ * whether B did a TOC setup sequence before calling us.
+ *
+ * On entry the LR points back to the _mcount() call site, and r0 holds the
+ * saved LR as it was on entry to B, ie. the original return address at the
+ * call site in A.
+ *
+ * Our job is to save the register state into a struct pt_regs (on the stack)
+ * and then arrange for the ftrace function to be called.
+ */
+_GLOBAL(ftrace_caller)
+	/* Save the original return address in A's stack frame */
+	std	r0,LRSAVE(r1)
+
+	/* Create our stack frame + pt_regs */
+	stdu	r1,-SWITCH_FRAME_SIZE(r1)
+
+	/* Save all gprs to pt_regs */
+	SAVE_8GPRS(0,r1)
+	SAVE_8GPRS(8,r1)
+	SAVE_8GPRS(16,r1)
+	SAVE_8GPRS(24,r1)
+
+	/* Load special regs for save below */
+	mfmsr   r8
+	mfctr   r9
+	mfxer   r10
+	mfcr	r11
+
+	/* Get the _mcount() call site out of LR */
+	mflr	r7
+	/* Save it as pt_regs->nip */
+	std     r7, _NIP(r1)
+	/* Save the read LR in pt_regs->link */
+	std     r0, _LINK(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2,PACATOC(r13)	/* get kernel TOC in r2 */
+
+	addis	r3,r2,function_trace_op@toc@ha
+	addi	r3,r3,function_trace_op@toc@l
+	ld	r5,0(r3)
+
+#ifdef CONFIG_LIVEPATCH
+	mr	r14,r7		/* remember old NIP */
+#endif
+	/* Calculate ip from nip-4 into r3 for call below */
+	subi    r3, r7, MCOUNT_INSN_SIZE
+
+	/* Put the original return address in r4 as parent_ip */
+	mr	r4, r0
+
+	/* Save special regs */
+	std     r8, _MSR(r1)
+	std     r9, _CTR(r1)
+	std     r10, _XER(r1)
+	std     r11, _CCR(r1)
+
+	/* Load &pt_regs in r6 for call below */
+	addi    r6, r1 ,STACK_FRAME_OVERHEAD
+
+	/* ftrace_call(r3, r4, r5, r6) */
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+
+	/* Load ctr with the possibly modified NIP */
+	ld	r3, _NIP(r1)
+	mtctr	r3
+#ifdef CONFIG_LIVEPATCH
+	cmpd	r14,r3		/* has NIP been altered? */
+#endif
+
+	/* Restore gprs */
+	REST_8GPRS(0,r1)
+	REST_8GPRS(8,r1)
+	REST_8GPRS(16,r1)
+	REST_8GPRS(24,r1)
+
+	/* Restore possibly modified LR */
+	ld	r0, _LINK(r1)
+	mtlr	r0
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	/* Pop our stack frame */
+	addi r1, r1, SWITCH_FRAME_SIZE
+
+#ifdef CONFIG_LIVEPATCH
+        /* Based on the cmpd above, if the NIP was altered handle livepatch */
+	bne-	livepatch_handler
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+
+	bctr			/* jump after _mcount site */
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#ifdef CONFIG_LIVEPATCH
+	/*
+	 * This function runs in the mcount context, between two functions. As
+	 * such it can only clobber registers which are volatile and used in
+	 * function linkage.
+	 *
+	 * We get here when a function A, calls another function B, but B has
+	 * been live patched with a new function C.
+	 *
+	 * On entry:
+	 *  - we have no stack frame and can not allocate one
+	 *  - LR points back to the original caller (in A)
+	 *  - CTR holds the new NIP in C
+	 *  - r0 & r12 are free
+	 *
+	 * r0 can't be used as the base register for a DS-form load or store, so
+	 * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
+	 */
+livepatch_handler:
+	CURRENT_THREAD_INFO(r12, r1)
+
+	/* Save stack pointer into r0 */
+	mr	r0, r1
+
+	/* Allocate 3 x 8 bytes */
+	ld	r1, TI_livepatch_sp(r12)
+	addi	r1, r1, 24
+	std	r1, TI_livepatch_sp(r12)
+
+	/* Save toc & real LR on livepatch stack */
+	std	r2,  -24(r1)
+	mflr	r12
+	std	r12, -16(r1)
+
+	/* Store stack end marker */
+	lis     r12, STACK_END_MAGIC@h
+	ori     r12, r12, STACK_END_MAGIC@l
+	std	r12, -8(r1)
+
+	/* Restore real stack pointer */
+	mr	r1, r0
+
+	/* Put ctr in r12 for global entry and branch there */
+	mfctr	r12
+	bctrl
+
+	/*
+	 * Now we are returning from the patched function to the original
+	 * caller A. We are free to use r0 and r12, and we can use r2 until we
+	 * restore it.
+	 */
+
+	CURRENT_THREAD_INFO(r12, r1)
+
+	/* Save stack pointer into r0 */
+	mr	r0, r1
+
+	ld	r1, TI_livepatch_sp(r12)
+
+	/* Check stack marker hasn't been trashed */
+	lis     r2,  STACK_END_MAGIC@h
+	ori     r2,  r2, STACK_END_MAGIC@l
+	ld	r12, -8(r1)
+1:	tdne	r12, r2
+	EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
+
+	/* Restore LR & toc from livepatch stack */
+	ld	r12, -16(r1)
+	mtlr	r12
+	ld	r2,  -24(r1)
+
+	/* Pop livepatch stack frame */
+	CURRENT_THREAD_INFO(r12, r0)
+	subi	r1, r1, 24
+	std	r1, TI_livepatch_sp(r12)
+
+	/* Restore real stack pointer */
+	mr	r1, r0
+
+	/* Return to original caller of live patched function */
+	blr
+#endif /* CONFIG_LIVEPATCH */
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	stdu	r1, -112(r1)
+	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
+	std	r10, 104(r1)
+	std	r9, 96(r1)
+	std	r8, 88(r1)
+	std	r7, 80(r1)
+	std	r6, 72(r1)
+	std	r5, 64(r1)
+	std	r4, 56(r1)
+	std	r3, 48(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
+
+	mfctr	r4		/* ftrace_caller has moved local addr here */
+	std	r4, 40(r1)
+	mflr	r3		/* ftrace_caller has restored LR from stack */
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR to this.
+	 */
+	mtlr	r3
+
+	ld	r0, 40(r1)
+	mtctr	r0
+	ld	r10, 104(r1)
+	ld	r9, 96(r1)
+	ld	r8, 88(r1)
+	ld	r7, 80(r1)
+	ld	r6, 72(r1)
+	ld	r5, 64(r1)
+	ld	r4, 56(r1)
+	ld	r3, 48(r1)
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	addi	r1, r1, 112
+	mflr	r0
+	std	r0, LRSAVE(r1)
+	bctr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S
new file mode 100644
index 000000000000..f095358da96e
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S
@@ -0,0 +1,68 @@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL_TOC(ftrace_caller)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+
+_GLOBAL(ftrace_stub)
+	blr
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	/* load r4 with local address */
+	ld	r4, 128(r1)
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	/* Grab the LR out of the caller stack frame */
+	ld	r11, 112(r1)
+	ld	r3, 16(r11)
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR in the callers stack frame to this.
+	 */
+	ld	r11, 112(r1)
+	std	r3, 16(r11)
+
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace_clock.c b/arch/powerpc/kernel/trace/trace_clock.c
index 49170690946d..49170690946d 100644
--- a/arch/powerpc/kernel/trace_clock.c
+++ b/arch/powerpc/kernel/trace/trace_clock.c
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index ff365f9de27a..d4e545d27ef9 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -35,13 +35,13 @@
 #include <linux/backlight.h>
 #include <linux/bug.h>
 #include <linux/kdebug.h>
-#include <linux/debugfs.h>
 #include <linux/ratelimit.h>
 #include <linux/context_tracking.h>
 
 #include <asm/emulated_ops.h>
 #include <asm/pgtable.h>
 #include <linux/uaccess.h>
+#include <asm/debugfs.h>
 #include <asm/io.h>
 #include <asm/machdep.h>
 #include <asm/rtas.h>
@@ -279,18 +279,35 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
 
 void system_reset_exception(struct pt_regs *regs)
 {
+	/*
+	 * Avoid crashes in case of nested NMI exceptions. Recoverability
+	 * is determined by RI and in_nmi
+	 */
+	bool nested = in_nmi();
+	if (!nested)
+		nmi_enter();
+
 	/* See if any machine dependent calls */
 	if (ppc_md.system_reset_exception) {
 		if (ppc_md.system_reset_exception(regs))
-			return;
+			goto out;
 	}
 
 	die("System Reset", regs, SIGABRT);
 
+out:
+#ifdef CONFIG_PPC_BOOK3S_64
+	BUG_ON(get_paca()->in_nmi == 0);
+	if (get_paca()->in_nmi > 1)
+		panic("Unrecoverable nested System Reset");
+#endif
 	/* Must die if the interrupt is not recoverable */
 	if (!(regs->msr & MSR_RI))
 		panic("Unrecoverable System Reset");
 
+	if (!nested)
+		nmi_exit();
+
 	/* What should we do here? We could issue a shutdown or hard reset. */
 }
 
@@ -306,8 +323,6 @@ long machine_check_early(struct pt_regs *regs)
 
 	__this_cpu_inc(irq_stat.mce_exceptions);
 
-	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
 	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
 		handled = cur_cpu_spec->machine_check_early(regs);
 	return handled;
@@ -741,6 +756,8 @@ void machine_check_exception(struct pt_regs *regs)
 
 	__this_cpu_inc(irq_stat.mce_exceptions);
 
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
 	/* See if any machine dependent calls. In theory, we would want
 	 * to call the CPU first, and call the ppc_md. one if the CPU
 	 * one returns a positive number. However there is existing code
@@ -1440,6 +1457,8 @@ void facility_unavailable_exception(struct pt_regs *regs)
 		[FSCR_TM_LG] = "TM",
 		[FSCR_EBB_LG] = "EBB",
 		[FSCR_TAR_LG] = "TAR",
+		[FSCR_MSGP_LG] = "MSGP",
+		[FSCR_SCV_LG] = "SCV",
 	};
 	char *facility = "unknown";
 	u64 value;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 1c24c894c908..2f793be3d2b1 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -77,6 +77,8 @@ SECTIONS
 #endif
 	} :kernel
 
+	__head_end = .;
+
 	/*
 	 * If the build dies here, it's likely code in head_64.S is referencing
 	 * labels it can't reach, and the linker inserting stubs without the
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b6b5c185bd92..aedacefd961d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -20,6 +20,10 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/miscdevice.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -31,10 +35,6 @@
 #include <asm/kvm_book3s.h>
 #include <asm/mmu_context.h>
 #include <asm/page.h>
-#include <linux/gfp.h>
-#include <linux/sched.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
 
 #include "book3s.h"
 #include "trace.h"
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index a587e8f4fd26..74b0153780e3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -229,6 +229,7 @@ void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
 
 static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 {
+	unsigned long vsid_bits = VSID_BITS_65_256M;
 	struct kvmppc_sid_map *map;
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	u16 sid_map_mask;
@@ -257,7 +258,12 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 		kvmppc_mmu_pte_flush(vcpu, 0, 0);
 		kvmppc_mmu_flush_segments(vcpu);
 	}
-	map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M);
+
+	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+		vsid_bits = VSID_BITS_256M;
+
+	map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++,
+				       VSID_MULTIPLIER_256M, vsid_bits);
 
 	map->guest_vsid = gvsid;
 	map->valid = true;
@@ -390,7 +396,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	int err;
 
-	err = __init_new_context();
+	err = hash__alloc_context_id();
 	if (err < 0)
 		return -1;
 	vcpu3s->context_id[0] = err;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1ec86d9e2a82..fadb75abfe37 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -35,6 +35,15 @@
 #include <linux/srcu.h>
 #include <linux/miscdevice.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/of.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -58,15 +67,6 @@
 #include <asm/mmu.h>
 #include <asm/opal.h>
 #include <asm/xics.h>
-#include <linux/gfp.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/hugetlb.h>
-#include <linux/kvm_irqfd.h>
-#include <linux/irqbypass.h>
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/of.h>
 
 #include "book3s.h"
 
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 4d6c64b3041c..a752e29977e0 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -23,6 +23,7 @@
 #include <asm/kvm_book3s.h>
 #include <asm/archrandom.h>
 #include <asm/xics.h>
+#include <asm/xive.h>
 #include <asm/dbell.h>
 #include <asm/cputhreads.h>
 #include <asm/io.h>
@@ -193,12 +194,6 @@ long kvmppc_h_random(struct kvm_vcpu *vcpu)
 	return H_HARDWARE;
 }
 
-static inline void rm_writeb(unsigned long paddr, u8 val)
-{
-	__asm__ __volatile__("stbcix %0,0,%1"
-		: : "r" (val), "r" (paddr) : "memory");
-}
-
 /*
  * Send an interrupt or message to another CPU.
  * The caller needs to include any barrier needed to order writes
@@ -206,7 +201,7 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
  */
 void kvmhv_rm_send_ipi(int cpu)
 {
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 
 	/* On POWER9 we can use msgsnd for any destination cpu. */
@@ -224,10 +219,14 @@ void kvmhv_rm_send_ipi(int cpu)
 		return;
 	}
 
+	/* We should never reach this */
+	if (WARN_ON_ONCE(xive_enabled()))
+	    return;
+
 	/* Else poke the target with an IPI */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
 	if (xics_phys)
-		rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+		__raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
 	else
 		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
 }
@@ -386,6 +385,9 @@ long kvmppc_read_intr(void)
 	long rc;
 	bool again;
 
+	if (xive_enabled())
+		return 1;
+
 	do {
 		again = false;
 		rc = kvmppc_read_one_intr(&again);
@@ -397,7 +399,7 @@ long kvmppc_read_intr(void)
 
 static long kvmppc_read_one_intr(bool *again)
 {
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	u32 h_xirr;
 	__be32 xirr;
 	u32 xisr;
@@ -415,7 +417,7 @@ static long kvmppc_read_one_intr(bool *again)
 	if (!xics_phys)
 		rc = opal_int_get_xirr(&xirr, false);
 	else
-		xirr = _lwzcix(xics_phys + XICS_XIRR);
+		xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
 	if (rc < 0)
 		return 1;
 
@@ -445,8 +447,8 @@ static long kvmppc_read_one_intr(bool *again)
 	if (xisr == XICS_IPI) {
 		rc = 0;
 		if (xics_phys) {
-			_stbcix(xics_phys + XICS_MFRR, 0xff);
-			_stwcix(xics_phys + XICS_XIRR, xirr);
+			__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
+			__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
 		} else {
 			opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
 			rc = opal_int_eoi(h_xirr);
@@ -471,7 +473,8 @@ static long kvmppc_read_one_intr(bool *again)
 			 * we need to resend that IPI, bummer
 			 */
 			if (xics_phys)
-				_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+				__raw_rm_writeb(IPI_PRIORITY,
+						xics_phys + XICS_MFRR);
 			else
 				opal_int_set_mfrr(hard_smp_processor_id(),
 						  IPI_PRIORITY);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e78542d99cd6..ffde4507ddfd 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -16,7 +16,6 @@
 #include <asm/kvm_ppc.h>
 #include <asm/hvcall.h>
 #include <asm/xics.h>
-#include <asm/debug.h>
 #include <asm/synch.h>
 #include <asm/cputhreads.h>
 #include <asm/pgtable.h>
@@ -766,7 +765,7 @@ unsigned long eoi_rc;
 
 static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 {
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	int64_t rc;
 
 	rc = pnv_opal_pci_msi_eoi(c, hwirq);
@@ -779,7 +778,7 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 	/* EOI it */
 	xics_phys = local_paca->kvm_hstate.xics_phys;
 	if (xics_phys) {
-		_stwcix(xics_phys + XICS_XIRR, xirr);
+		__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
 	} else {
 		rc = opal_int_eoi(be32_to_cpu(xirr));
 		*again = rc > 0;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index e48803e2918d..459b72cb617a 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -19,10 +19,9 @@
 #include <asm/kvm_ppc.h>
 #include <asm/hvcall.h>
 #include <asm/xics.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/time.h>
 
-#include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
 #include "book3s_xics.h"
@@ -1084,7 +1083,7 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
 	return xics->ics[icsid];
 }
 
-int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+static int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
 {
 	struct kvmppc_icp *icp;
 
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0d3002b7e2b4..500b0f6a0b64 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/kprobes.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/mm.h>
@@ -59,7 +60,7 @@ bool is_offset_in_branch_range(long offset)
  * Helper to check if a given instruction is a conditional branch
  * Derived from the conditional checks in analyse_instr()
  */
-bool __kprobes is_conditional_branch(unsigned int instr)
+bool is_conditional_branch(unsigned int instr)
 {
 	unsigned int opcode = instr >> 26;
 
@@ -75,6 +76,7 @@ bool __kprobes is_conditional_branch(unsigned int instr)
 	}
 	return false;
 }
+NOKPROBE_SYMBOL(is_conditional_branch);
 
 unsigned int create_branch(const unsigned int *addr,
 			   unsigned long target, int flags)
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 9c542ec70c5b..33117f8a0882 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -49,7 +49,8 @@ extern int do_stxvd2x(int rn, unsigned long ea);
 /*
  * Emulate the truncation of 64 bit values in 32-bit mode.
  */
-static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val)
+static nokprobe_inline unsigned long truncate_if_32bit(unsigned long msr,
+							unsigned long val)
 {
 #ifdef __powerpc64__
 	if ((msr & MSR_64BIT) == 0)
@@ -61,7 +62,7 @@ static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val)
 /*
  * Determine whether a conditional branch instruction would branch.
  */
-static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs *regs)
 {
 	unsigned int bo = (instr >> 21) & 0x1f;
 	unsigned int bi;
@@ -81,8 +82,7 @@ static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs)
 	return 1;
 }
 
-
-static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
+static nokprobe_inline long address_ok(struct pt_regs *regs, unsigned long ea, int nb)
 {
 	if (!user_mode(regs))
 		return 1;
@@ -92,7 +92,7 @@ static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
 /*
  * Calculate effective address for a D-form instruction
  */
-static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline unsigned long dform_ea(unsigned int instr, struct pt_regs *regs)
 {
 	int ra;
 	unsigned long ea;
@@ -109,7 +109,7 @@ static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs
 /*
  * Calculate effective address for a DS-form instruction
  */
-static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline unsigned long dsform_ea(unsigned int instr, struct pt_regs *regs)
 {
 	int ra;
 	unsigned long ea;
@@ -126,8 +126,8 @@ static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *reg
 /*
  * Calculate effective address for an X-form instruction
  */
-static unsigned long __kprobes xform_ea(unsigned int instr,
-					struct pt_regs *regs)
+static nokprobe_inline unsigned long xform_ea(unsigned int instr,
+						struct pt_regs *regs)
 {
 	int ra, rb;
 	unsigned long ea;
@@ -145,33 +145,33 @@ static unsigned long __kprobes xform_ea(unsigned int instr,
  * Return the largest power of 2, not greater than sizeof(unsigned long),
  * such that x is a multiple of it.
  */
-static inline unsigned long max_align(unsigned long x)
+static nokprobe_inline unsigned long max_align(unsigned long x)
 {
 	x |= sizeof(unsigned long);
 	return x & -x;		/* isolates rightmost bit */
 }
 
 
-static inline unsigned long byterev_2(unsigned long x)
+static nokprobe_inline unsigned long byterev_2(unsigned long x)
 {
 	return ((x >> 8) & 0xff) | ((x & 0xff) << 8);
 }
 
-static inline unsigned long byterev_4(unsigned long x)
+static nokprobe_inline unsigned long byterev_4(unsigned long x)
 {
 	return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) |
 		((x & 0xff00) << 8) | ((x & 0xff) << 24);
 }
 
 #ifdef __powerpc64__
-static inline unsigned long byterev_8(unsigned long x)
+static nokprobe_inline unsigned long byterev_8(unsigned long x)
 {
 	return (byterev_4(x) << 32) | byterev_4(x >> 32);
 }
 #endif
 
-static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea,
-				      int nb)
+static nokprobe_inline int read_mem_aligned(unsigned long *dest,
+					unsigned long ea, int nb)
 {
 	int err = 0;
 	unsigned long x = 0;
@@ -197,8 +197,8 @@ static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea,
 	return err;
 }
 
-static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,
-					int nb, struct pt_regs *regs)
+static nokprobe_inline int read_mem_unaligned(unsigned long *dest,
+				unsigned long ea, int nb, struct pt_regs *regs)
 {
 	int err;
 	unsigned long x, b, c;
@@ -248,7 +248,7 @@ static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,
  * Read memory at address ea for nb bytes, return 0 for success
  * or -EFAULT if an error occurred.
  */
-static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb,
+static int read_mem(unsigned long *dest, unsigned long ea, int nb,
 			      struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea, nb))
@@ -257,9 +257,10 @@ static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb,
 		return read_mem_aligned(dest, ea, nb);
 	return read_mem_unaligned(dest, ea, nb, regs);
 }
+NOKPROBE_SYMBOL(read_mem);
 
-static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea,
-				       int nb)
+static nokprobe_inline int write_mem_aligned(unsigned long val,
+					unsigned long ea, int nb)
 {
 	int err = 0;
 
@@ -282,8 +283,8 @@ static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea,
 	return err;
 }
 
-static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,
-					 int nb, struct pt_regs *regs)
+static nokprobe_inline int write_mem_unaligned(unsigned long val,
+				unsigned long ea, int nb, struct pt_regs *regs)
 {
 	int err;
 	unsigned long c;
@@ -325,7 +326,7 @@ static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,
  * Write memory at address ea for nb bytes, return 0 for success
  * or -EFAULT if an error occurred.
  */
-static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
+static int write_mem(unsigned long val, unsigned long ea, int nb,
 			       struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea, nb))
@@ -334,13 +335,14 @@ static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
 		return write_mem_aligned(val, ea, nb);
 	return write_mem_unaligned(val, ea, nb, regs);
 }
+NOKPROBE_SYMBOL(write_mem);
 
 #ifdef CONFIG_PPC_FPU
 /*
  * Check the address and alignment, and call func to do the actual
  * load or store.
  */
-static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),
+static int do_fp_load(int rn, int (*func)(int, unsigned long),
 				unsigned long ea, int nb,
 				struct pt_regs *regs)
 {
@@ -380,8 +382,9 @@ static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),
 		return err;
 	return (*func)(rn, ptr);
 }
+NOKPROBE_SYMBOL(do_fp_load);
 
-static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
+static int do_fp_store(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, int nb,
 				 struct pt_regs *regs)
 {
@@ -425,11 +428,12 @@ static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
 	}
 	return err;
 }
+NOKPROBE_SYMBOL(do_fp_store);
 #endif
 
 #ifdef CONFIG_ALTIVEC
 /* For Altivec/VMX, no need to worry about alignment */
-static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vec_load(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea & ~0xfUL, 16))
@@ -437,7 +441,7 @@ static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long),
 	return (*func)(rn, ea);
 }
 
-static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vec_store(int rn, int (*func)(int, unsigned long),
 				  unsigned long ea, struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea & ~0xfUL, 16))
@@ -447,7 +451,7 @@ static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long),
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
-static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vsx_load(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, struct pt_regs *regs)
 {
 	int err;
@@ -465,7 +469,7 @@ static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long),
 	return err;
 }
 
-static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vsx_store(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, struct pt_regs *regs)
 {
 	int err;
@@ -522,7 +526,7 @@ static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long),
 		: "=r" (err)				\
 		: "r" (addr), "i" (-EFAULT), "0" (err))
 
-static void __kprobes set_cr0(struct pt_regs *regs, int rd)
+static nokprobe_inline void set_cr0(struct pt_regs *regs, int rd)
 {
 	long val = regs->gpr[rd];
 
@@ -539,7 +543,7 @@ static void __kprobes set_cr0(struct pt_regs *regs, int rd)
 		regs->ccr |= 0x20000000;
 }
 
-static void __kprobes add_with_carry(struct pt_regs *regs, int rd,
+static nokprobe_inline void add_with_carry(struct pt_regs *regs, int rd,
 				     unsigned long val1, unsigned long val2,
 				     unsigned long carry_in)
 {
@@ -560,7 +564,7 @@ static void __kprobes add_with_carry(struct pt_regs *regs, int rd,
 		regs->xer &= ~XER_CA;
 }
 
-static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2,
+static nokprobe_inline void do_cmp_signed(struct pt_regs *regs, long v1, long v2,
 				    int crfld)
 {
 	unsigned int crval, shift;
@@ -576,7 +580,7 @@ static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2,
 	regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
 }
 
-static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
+static nokprobe_inline void do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
 				      unsigned long v2, int crfld)
 {
 	unsigned int crval, shift;
@@ -592,7 +596,7 @@ static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
 	regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
 }
 
-static int __kprobes trap_compare(long v1, long v2)
+static nokprobe_inline int trap_compare(long v1, long v2)
 {
 	int ret = 0;
 
@@ -631,7 +635,7 @@ static int __kprobes trap_compare(long v1, long v2)
  * Returns 1 if the instruction has been executed, or 0 if not.
  * Sets *op to indicate what the instruction does.
  */
-int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
+int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 			    unsigned int instr)
 {
 	unsigned int opcode, ra, rb, rd, spr, u;
@@ -1692,6 +1696,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 #endif
 }
 EXPORT_SYMBOL_GPL(analyse_instr);
+NOKPROBE_SYMBOL(analyse_instr);
 
 /*
  * For PPC32 we always use stwu with r1 to change the stack pointer.
@@ -1701,7 +1706,7 @@ EXPORT_SYMBOL_GPL(analyse_instr);
  * don't emulate the real store operation. We will do real store
  * operation safely in exception return code by checking this flag.
  */
-static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs)
+static nokprobe_inline int handle_stack_update(unsigned long ea, struct pt_regs *regs)
 {
 #ifdef CONFIG_PPC32
 	/*
@@ -1721,7 +1726,7 @@ static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs)
 	return 0;
 }
 
-static __kprobes void do_signext(unsigned long *valp, int size)
+static nokprobe_inline void do_signext(unsigned long *valp, int size)
 {
 	switch (size) {
 	case 2:
@@ -1733,7 +1738,7 @@ static __kprobes void do_signext(unsigned long *valp, int size)
 	}
 }
 
-static __kprobes void do_byterev(unsigned long *valp, int size)
+static nokprobe_inline void do_byterev(unsigned long *valp, int size)
 {
 	switch (size) {
 	case 2:
@@ -1757,7 +1762,7 @@ static __kprobes void do_byterev(unsigned long *valp, int size)
  * or -1 if the instruction is one that should not be stepped,
  * such as an rfid, or a mtmsrd that would clear MSR_RI.
  */
-int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
+int emulate_step(struct pt_regs *regs, unsigned int instr)
 {
 	struct instruction_op op;
 	int r, err, size;
@@ -1988,3 +1993,4 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
 	regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4);
 	return 1;
 }
+NOKPROBE_SYMBOL(emulate_step);
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index d979709a0239..c6b900f54c07 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -468,7 +468,7 @@ static void walk_linearmapping(struct pg_state *st)
 	unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift;
 
 	for (addr = PAGE_OFFSET; addr < PAGE_OFFSET +
-			memblock_phys_mem_size(); addr += psize)
+			memblock_end_of_DRAM(); addr += psize)
 		hpte_find(st, addr, mmu_linear_psize);
 }
 
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index 49abaf4dc8e3..d659345a98d6 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -26,6 +26,10 @@
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 
+#ifdef CONFIG_PPC32
+#define KERN_VIRT_START	0
+#endif
+
 /*
  * To visualise what is happening,
  *
@@ -56,6 +60,8 @@ struct pg_state {
 	struct seq_file *seq;
 	const struct addr_marker *marker;
 	unsigned long start_address;
+	unsigned long start_pa;
+	unsigned long last_pa;
 	unsigned int level;
 	u64 current_flags;
 };
@@ -69,6 +75,7 @@ static struct addr_marker address_markers[] = {
 	{ 0,	"Start of kernel VM" },
 	{ 0,	"vmalloc() Area" },
 	{ 0,	"vmalloc() End" },
+#ifdef CONFIG_PPC64
 	{ 0,	"isa I/O start" },
 	{ 0,	"isa I/O end" },
 	{ 0,	"phb I/O start" },
@@ -76,6 +83,20 @@ static struct addr_marker address_markers[] = {
 	{ 0,	"I/O remap start" },
 	{ 0,	"I/O remap end" },
 	{ 0,	"vmemmap start" },
+#else
+	{ 0,	"Early I/O remap start" },
+	{ 0,	"Early I/O remap end" },
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	{ 0,	"Consistent mem start" },
+	{ 0,	"Consistent mem end" },
+#endif
+#ifdef CONFIG_HIGHMEM
+	{ 0,	"Highmem PTEs start" },
+	{ 0,	"Highmem PTEs end" },
+#endif
+	{ 0,	"Fixmap start" },
+	{ 0,	"Fixmap end" },
+#endif
 	{ -1,	NULL },
 };
 
@@ -100,8 +121,13 @@ static const struct flag_info flag_array[] = {
 		.set	= "user",
 		.clear	= "    ",
 	}, {
+#if _PAGE_RO == 0
 		.mask	= _PAGE_RW,
 		.val	= _PAGE_RW,
+#else
+		.mask	= _PAGE_RO,
+		.val	= 0,
+#endif
 		.set	= "rw",
 		.clear	= "ro",
 	}, {
@@ -154,11 +180,24 @@ static const struct flag_info flag_array[] = {
 		.clear	= "             ",
 	}, {
 #endif
+#ifndef CONFIG_PPC_BOOK3S_64
 		.mask	= _PAGE_NO_CACHE,
 		.val	= _PAGE_NO_CACHE,
 		.set	= "no cache",
 		.clear	= "        ",
 	}, {
+#else
+		.mask	= _PAGE_NON_IDEMPOTENT,
+		.val	= _PAGE_NON_IDEMPOTENT,
+		.set	= "non-idempotent",
+		.clear	= "              ",
+	}, {
+		.mask	= _PAGE_TOLERANT,
+		.val	= _PAGE_TOLERANT,
+		.set	= "tolerant",
+		.clear	= "        ",
+	}, {
+#endif
 #ifdef CONFIG_PPC_BOOK3S_64
 		.mask	= H_PAGE_BUSY,
 		.val	= H_PAGE_BUSY,
@@ -188,6 +227,10 @@ static const struct flag_info flag_array[] = {
 		.mask	= _PAGE_SPECIAL,
 		.val	= _PAGE_SPECIAL,
 		.set	= "special",
+	}, {
+		.mask	= _PAGE_SHARED,
+		.val	= _PAGE_SHARED,
+		.set	= "shared",
 	}
 };
 
@@ -252,7 +295,14 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
 	const char *unit = units;
 	unsigned long delta;
 
-	seq_printf(st->seq, "0x%016lx-0x%016lx   ", st->start_address, addr-1);
+#ifdef CONFIG_PPC64
+	seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
+	seq_printf(st->seq, "0x%016lx ", st->start_pa);
+#else
+	seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1);
+	seq_printf(st->seq, "0x%08lx ", st->start_pa);
+#endif
+
 	delta = (addr - st->start_address) >> 10;
 	/* Work out what appropriate unit to use */
 	while (!(delta & 1023) && unit[1]) {
@@ -267,11 +317,15 @@ static void note_page(struct pg_state *st, unsigned long addr,
 	       unsigned int level, u64 val)
 {
 	u64 flag = val & pg_level[level].mask;
+	u64 pa = val & PTE_RPN_MASK;
+
 	/* At first no level is set */
 	if (!st->level) {
 		st->level = level;
 		st->current_flags = flag;
 		st->start_address = addr;
+		st->start_pa = pa;
+		st->last_pa = pa;
 		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 	/*
 	 * Dump the section of virtual memory when:
@@ -279,9 +333,11 @@ static void note_page(struct pg_state *st, unsigned long addr,
 	 *   - we change levels in the tree.
 	 *   - the address is in a different section of memory and is thus
 	 *   used for a different purpose, regardless of the flags.
+	 *   - the pa of this page is not adjacent to the last inspected page
 	 */
 	} else if (flag != st->current_flags || level != st->level ||
-		   addr >= st->marker[1].start_address) {
+		   addr >= st->marker[1].start_address ||
+		   pa != st->last_pa + PAGE_SIZE) {
 
 		/* Check the PTE flags */
 		if (st->current_flags) {
@@ -305,8 +361,12 @@ static void note_page(struct pg_state *st, unsigned long addr,
 			seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 		}
 		st->start_address = addr;
+		st->start_pa = pa;
+		st->last_pa = pa;
 		st->current_flags = flag;
 		st->level = level;
+	} else {
+		st->last_pa = pa;
 	}
 }
 
@@ -377,20 +437,38 @@ static void walk_pagetables(struct pg_state *st)
 
 static void populate_markers(void)
 {
-	address_markers[0].start_address = PAGE_OFFSET;
-	address_markers[1].start_address = VMALLOC_START;
-	address_markers[2].start_address = VMALLOC_END;
-	address_markers[3].start_address = ISA_IO_BASE;
-	address_markers[4].start_address = ISA_IO_END;
-	address_markers[5].start_address = PHB_IO_BASE;
-	address_markers[6].start_address = PHB_IO_END;
-	address_markers[7].start_address = IOREMAP_BASE;
-	address_markers[8].start_address = IOREMAP_END;
+	int i = 0;
+
+	address_markers[i++].start_address = PAGE_OFFSET;
+	address_markers[i++].start_address = VMALLOC_START;
+	address_markers[i++].start_address = VMALLOC_END;
+#ifdef CONFIG_PPC64
+	address_markers[i++].start_address = ISA_IO_BASE;
+	address_markers[i++].start_address = ISA_IO_END;
+	address_markers[i++].start_address = PHB_IO_BASE;
+	address_markers[i++].start_address = PHB_IO_END;
+	address_markers[i++].start_address = IOREMAP_BASE;
+	address_markers[i++].start_address = IOREMAP_END;
 #ifdef CONFIG_PPC_STD_MMU_64
-	address_markers[9].start_address =  H_VMEMMAP_BASE;
+	address_markers[i++].start_address =  H_VMEMMAP_BASE;
 #else
-	address_markers[9].start_address =  VMEMMAP_BASE;
+	address_markers[i++].start_address =  VMEMMAP_BASE;
+#endif
+#else /* !CONFIG_PPC64 */
+	address_markers[i++].start_address = ioremap_bot;
+	address_markers[i++].start_address = IOREMAP_TOP;
+#ifdef CONFIG_NOT_COHERENT_CACHE
+	address_markers[i++].start_address = IOREMAP_TOP;
+	address_markers[i++].start_address = IOREMAP_TOP +
+					     CONFIG_CONSISTENT_SIZE;
+#endif
+#ifdef CONFIG_HIGHMEM
+	address_markers[i++].start_address = PKMAP_BASE;
+	address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
 #endif
+	address_markers[i++].start_address = FIXADDR_START;
+	address_markers[i++].start_address = FIXADDR_TOP;
+#endif /* CONFIG_PPC64 */
 }
 
 static int ptdump_show(struct seq_file *m, void *v)
@@ -435,7 +513,7 @@ static int ptdump_init(void)
 
 	populate_markers();
 	build_pgtable_complete_mask();
-	debugfs_file = debugfs_create_file("kernel_pagetables", 0400, NULL,
+	debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL,
 			NULL, &ptdump_fops);
 	return debugfs_file ? 0 : -ENOMEM;
 }
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 51def8a515be..3a7d580fdc59 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -120,8 +120,6 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 	siginfo_t info;
 	unsigned int lsb = 0;
 
-	up_read(&current->mm->mmap_sem);
-
 	if (!user_mode(regs))
 		return MM_FAULT_ERR(SIGBUS);
 
@@ -154,13 +152,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 	 * continue the pagefault.
 	 */
 	if (fatal_signal_pending(current)) {
-		/*
-		 * If we have retry set, the mmap semaphore will have
-		 * alrady been released in __lock_page_or_retry(). Else
-		 * we release it now.
-		 */
-		if (!(fault & VM_FAULT_RETRY))
-			up_read(&current->mm->mmap_sem);
 		/* Coming from kernel, we need to deal with uaccess fixups */
 		if (user_mode(regs))
 			return MM_FAULT_RETURN;
@@ -173,8 +164,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
 
 	/* Out of memory */
 	if (fault & VM_FAULT_OOM) {
-		up_read(&current->mm->mmap_sem);
-
 		/*
 		 * We ran out of memory, or some other thing happened to us that
 		 * made us unable to handle the page fault gracefully.
@@ -298,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * can result in fault, which will cause a deadlock when called with
 	 * mmap_sem held
 	 */
-	if (user_mode(regs))
+	if (!is_exec && user_mode(regs))
 		store_update_sp = store_updates_sp(regs);
 
 	if (user_mode(regs))
@@ -458,9 +447,30 @@ good_area:
 	 * the fault.
 	 */
 	fault = handle_mm_fault(vma, address, flags);
+
+	/*
+	 * Handle the retry right now, the mmap_sem has been released in that
+	 * case.
+	 */
+	if (unlikely(fault & VM_FAULT_RETRY)) {
+		/* We retry only once */
+		if (flags & FAULT_FLAG_ALLOW_RETRY) {
+			/*
+			 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+			 * of starvation.
+			 */
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			flags |= FAULT_FLAG_TRIED;
+			if (!fatal_signal_pending(current))
+				goto retry;
+		}
+		/* We will enter mm_fault_error() below */
+	} else
+		up_read(&current->mm->mmap_sem);
+
 	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
 		if (fault & VM_FAULT_SIGSEGV)
-			goto bad_area;
+			goto bad_area_nosemaphore;
 		rc = mm_fault_error(regs, address, fault);
 		if (rc >= MM_FAULT_RETURN)
 			goto bail;
@@ -469,41 +479,29 @@ good_area:
 	}
 
 	/*
-	 * Major/minor page fault accounting is only done on the
-	 * initial attempt. If we go through a retry, it is extremely
-	 * likely that the page will be found in page cache at that point.
+	 * Major/minor page fault accounting.
 	 */
-	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			current->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
+	if (fault & VM_FAULT_MAJOR) {
+		current->maj_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+			      regs, address);
 #ifdef CONFIG_PPC_SMLPAR
-			if (firmware_has_feature(FW_FEATURE_CMO)) {
-				u32 page_ins;
-
-				preempt_disable();
-				page_ins = be32_to_cpu(get_lppaca()->page_ins);
-				page_ins += 1 << PAGE_FACTOR;
-				get_lppaca()->page_ins = cpu_to_be32(page_ins);
-				preempt_enable();
-			}
-#endif /* CONFIG_PPC_SMLPAR */
-		} else {
-			current->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
-		if (fault & VM_FAULT_RETRY) {
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~FAULT_FLAG_ALLOW_RETRY;
-			flags |= FAULT_FLAG_TRIED;
-			goto retry;
+		if (firmware_has_feature(FW_FEATURE_CMO)) {
+			u32 page_ins;
+
+			preempt_disable();
+			page_ins = be32_to_cpu(get_lppaca()->page_ins);
+			page_ins += 1 << PAGE_FACTOR;
+			get_lppaca()->page_ins = cpu_to_be32(page_ins);
+			preempt_enable();
 		}
+#endif /* CONFIG_PPC_SMLPAR */
+	} else {
+		current->min_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+			      regs, address);
 	}
 
-	up_read(&mm->mmap_sem);
 	goto bail;
 
 bad_area:
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 09cc50c8dace..6f962e5cb5e1 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -31,10 +31,8 @@
 #ifdef CONFIG_SMP
 	.section .bss
 	.align	2
-	.globl mmu_hash_lock
 mmu_hash_lock:
 	.space	4
-EXPORT_SYMBOL(mmu_hash_lock)
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index c554768b1fa2..f2095ce9d4b0 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,9 +35,8 @@
 #include <linux/memblock.h>
 #include <linux/context_tracking.h>
 #include <linux/libfdt.h>
-#include <linux/debugfs.h>
 
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/processor.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
@@ -927,11 +926,6 @@ static void __init htab_initialize(void)
 	}
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
-	/* On U3 based machines, we need to reserve the DART area and
-	 * _NOT_ map it to avoid cache paradoxes as it's remapped non
-	 * cacheable later on
-	 */
-
 	/* create bolted the linear mapping in the hash table */
 	for_each_memblock(memory, reg) {
 		base = (unsigned long)__va(reg->base);
@@ -981,6 +975,19 @@ void __init hash__early_init_devtree(void)
 
 void __init hash__early_init_mmu(void)
 {
+	/*
+	 * We have code in __hash_page_64K() and elsewhere, which assumes it can
+	 * do the following:
+	 *   new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
+	 *
+	 * Where the slot number is between 0-15, and values of 8-15 indicate
+	 * the secondary bucket. For that code to work H_PAGE_F_SECOND and
+	 * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
+	 * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
+	 * with a BUILD_BUG_ON().
+	 */
+	BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul  << (H_PAGE_F_GIX_SHIFT + 3)));
+
 	htab_init_page_sizes();
 
 	/*
@@ -1120,7 +1127,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
 	copro_flush_all_slbs(mm);
 	if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
 
-		copy_mm_to_paca(&mm->context);
+		copy_mm_to_paca(mm);
 		slb_flush_and_rebolt();
 	}
 }
@@ -1192,7 +1199,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
 {
 	if (user_region) {
 		if (psize != get_paca_psize(ea)) {
-			copy_mm_to_paca(&mm->context);
+			copy_mm_to_paca(mm);
 			slb_flush_and_rebolt();
 		}
 	} else if (get_paca()->vmalloc_sllp !=
@@ -1855,5 +1862,4 @@ static int __init hash64_debugfs(void)
 	return 0;
 }
 machine_device_initcall(pseries, hash64_debugfs);
-
 #endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index 83a8be791e06..bfe4e8526b2d 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -148,16 +148,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
 
 	mm = vma->vm_mm;
 
-#ifdef CONFIG_PPC_MM_SLICES
-	psize = get_slice_psize(mm, ea);
-	tsize = mmu_get_tsize(psize);
-	shift = mmu_psize_defs[psize].shift;
-#else
 	psize = vma_mmu_pagesize(vma);
 	shift = __ilog2(psize);
 	tsize = shift - 10;
-#endif
-
 	/*
 	 * We can't be interrupted while we're setting up the MAS
 	 * regusters or after we've confirmed that no tlb exists.
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 35254a678456..6575b9aabef4 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -50,9 +50,12 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct hstate *h = hstate_file(file);
 	struct vm_unmapped_area_info info;
 
+	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
-	if (len > TASK_SIZE)
+	if (len > mm->task_size)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
@@ -64,7 +67,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr) {
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (mm->task_size - len >= addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -78,5 +81,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	info.high_limit = current->mm->mmap_base;
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
+
+	if (addr > DEFAULT_MAP_WINDOW)
+		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
 	return vm_unmapped_area(&info);
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 8c3389cbcd12..a4f33de4008e 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -753,6 +753,24 @@ static int __init add_huge_page_size(unsigned long long size)
 	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 		return -EINVAL;
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * We need to make sure that for different page sizes reported by
+	 * firmware we only add hugetlb support for page sizes that can be
+	 * supported by linux page table layout.
+	 * For now we have
+	 * Radix: 2M
+	 * Hash: 16M and 16G
+	 */
+	if (radix_enabled()) {
+		if (mmu_psize != MMU_PAGE_2M)
+			return -EINVAL;
+	} else {
+		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
+			return -EINVAL;
+	}
+#endif
+
 	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 
 	/* Return if huge page size has already been setup */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index c22f207aa656..ec84b31c6c86 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -71,10 +71,6 @@
 #if H_PGTABLE_RANGE > USER_VSID_RANGE
 #warning Limited user VSID range means pagetable space is wasted
 #endif
-
-#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
-#warning TASK_SIZE is smaller than it needs to be.
-#endif
 #endif /* CONFIG_PPC_STD_MMU_64 */
 
 phys_addr_t memstart_addr = ~0;
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index a5d9ef59debe..9dbd2a733d6b 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -59,13 +59,14 @@ static inline int mmap_is_legacy(void)
 
 unsigned long arch_mmap_rnd(void)
 {
-	unsigned long rnd;
+	unsigned long shift, rnd;
 
-	/* 8MB for 32bit, 1GB for 64bit */
+	shift = mmap_rnd_bits;
+#ifdef CONFIG_COMPAT
 	if (is_32bit_task())
-		rnd = get_random_long() % (1<<(23-PAGE_SHIFT));
-	else
-		rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT));
+		shift = mmap_rnd_compat_bits;
+#endif
+	rnd = get_random_long() % (1ul << shift);
 
 	return rnd << PAGE_SHIFT;
 }
@@ -79,7 +80,7 @@ static inline unsigned long mmap_base(unsigned long rnd)
 	else if (gap > MAX_GAP)
 		gap = MAX_GAP;
 
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+	return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd);
 }
 
 #ifdef CONFIG_PPC_RADIX_MMU
@@ -97,7 +98,11 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	struct vm_unmapped_area_info info;
 
-	if (len > TASK_SIZE - mmap_min_addr)
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
+	if (len > mm->task_size - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -106,7 +111,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -114,8 +119,13 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = mm->mmap_base;
-	info.high_limit = TASK_SIZE;
 	info.align_mask = 0;
+
+	if (unlikely(addr > DEFAULT_MAP_WINDOW))
+		info.high_limit = mm->context.addr_limit;
+	else
+		info.high_limit = DEFAULT_MAP_WINDOW;
+
 	return vm_unmapped_area(&info);
 }
 
@@ -131,8 +141,12 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	unsigned long addr = addr0;
 	struct vm_unmapped_area_info info;
 
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
 	/* requested length too big for entire address space */
-	if (len > TASK_SIZE - mmap_min_addr)
+	if (len > mm->task_size - mmap_min_addr)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED)
@@ -142,7 +156,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
 				(!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
@@ -152,7 +166,14 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
 	info.high_limit = mm->mmap_base;
 	info.align_mask = 0;
+
+	if (addr > DEFAULT_MAP_WINDOW)
+		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
 	addr = vm_unmapped_area(&info);
+	if (!(addr & ~PAGE_MASK))
+		return addr;
+	VM_BUG_ON(addr != -ENOMEM);
 
 	/*
 	 * A failed mmap() very likely causes application failure,
@@ -160,15 +181,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	if (addr & ~PAGE_MASK) {
-		VM_BUG_ON(addr != -ENOMEM);
-		info.flags = 0;
-		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = TASK_SIZE;
-		addr = vm_unmapped_area(&info);
-	}
-
-	return addr;
+	return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 }
 
 static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 73bf6e14c3aa..c6dca2ae78ef 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -30,17 +30,16 @@
 static DEFINE_SPINLOCK(mmu_context_lock);
 static DEFINE_IDA(mmu_context_ida);
 
-int __init_new_context(void)
+static int alloc_context_id(int min_id, int max_id)
 {
-	int index;
-	int err;
+	int index, err;
 
 again:
 	if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
 		return -ENOMEM;
 
 	spin_lock(&mmu_context_lock);
-	err = ida_get_new_above(&mmu_context_ida, 1, &index);
+	err = ida_get_new_above(&mmu_context_ida, min_id, &index);
 	spin_unlock(&mmu_context_lock);
 
 	if (err == -EAGAIN)
@@ -48,7 +47,7 @@ again:
 	else if (err)
 		return err;
 
-	if (index > MAX_USER_CONTEXT) {
+	if (index > max_id) {
 		spin_lock(&mmu_context_lock);
 		ida_remove(&mmu_context_ida, index);
 		spin_unlock(&mmu_context_lock);
@@ -57,48 +56,105 @@ again:
 
 	return index;
 }
-EXPORT_SYMBOL_GPL(__init_new_context);
-static int radix__init_new_context(struct mm_struct *mm, int index)
+
+void hash__reserve_context_id(int id)
+{
+	int rc, result = 0;
+
+	do {
+		if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
+			break;
+
+		spin_lock(&mmu_context_lock);
+		rc = ida_get_new_above(&mmu_context_ida, id, &result);
+		spin_unlock(&mmu_context_lock);
+	} while (rc == -EAGAIN);
+
+	WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
+}
+
+int hash__alloc_context_id(void)
+{
+	unsigned long max;
+
+	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+		max = MAX_USER_CONTEXT;
+	else
+		max = MAX_USER_CONTEXT_65BIT_VA;
+
+	return alloc_context_id(MIN_USER_CONTEXT, max);
+}
+EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+
+static int hash__init_new_context(struct mm_struct *mm)
+{
+	int index;
+
+	index = hash__alloc_context_id();
+	if (index < 0)
+		return index;
+
+	/*
+	 * We do switch_slb() early in fork, even before we setup the
+	 * mm->context.addr_limit. Default to max task size so that we copy the
+	 * default values to paca which will help us to handle slb miss early.
+	 */
+	mm->context.addr_limit = TASK_SIZE_128TB;
+
+	/*
+	 * The old code would re-promote on fork, we don't do that when using
+	 * slices as it could cause problem promoting slices that have been
+	 * forced down to 4K.
+	 *
+	 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+	 * explicitly against context.id == 0. This ensures that we properly
+	 * initialize context slice details for newly allocated mm's (which will
+	 * have id == 0) and don't alter context slice inherited via fork (which
+	 * will have id != 0).
+	 *
+	 * We should not be calling init_new_context() on init_mm. Hence a
+	 * check against 0 is OK.
+	 */
+	if (mm->context.id == 0)
+		slice_set_user_psize(mm, mmu_virtual_psize);
+
+	subpage_prot_init_new_context(mm);
+
+	return index;
+}
+
+static int radix__init_new_context(struct mm_struct *mm)
 {
 	unsigned long rts_field;
+	int index;
+
+	index = alloc_context_id(1, PRTB_ENTRIES - 1);
+	if (index < 0)
+		return index;
 
 	/*
 	 * set the process table entry,
 	 */
 	rts_field = radix__get_tree_size();
 	process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
-	return 0;
+
+	mm->context.npu_context = NULL;
+
+	return index;
 }
 
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	int index;
 
-	index = __init_new_context();
+	if (radix_enabled())
+		index = radix__init_new_context(mm);
+	else
+		index = hash__init_new_context(mm);
+
 	if (index < 0)
 		return index;
 
-	if (radix_enabled()) {
-		radix__init_new_context(mm, index);
-	} else {
-
-		/* The old code would re-promote on fork, we don't do that
-		 * when using slices as it could cause problem promoting slices
-		 * that have been forced down to 4K
-		 *
-		 * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
-		 * explicitly against context.id == 0. This ensures that we
-		 * properly initialize context slice details for newly allocated
-		 * mm's (which will have id == 0) and don't alter context slice
-		 * inherited via fork (which will have id != 0).
-		 *
-		 * We should not be calling init_new_context() on init_mm. Hence a
-		 * check against 0 is ok.
-		 */
-		if (mm->context.id == 0)
-			slice_set_user_psize(mm, mmu_virtual_psize);
-		subpage_prot_init_new_context(mm);
-	}
 	mm->context.id = index;
 #ifdef CONFIG_PPC_ICSWX
 	mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 497130c5c742..e0a2d8e806ed 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -81,7 +81,7 @@ struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
 	gfp_t gfp_mask = GFP_USER;
 	struct page *new_page;
 
-	if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+	if (PageCompound(page))
 		return NULL;
 
 	if (PageHighMem(page))
@@ -100,7 +100,7 @@ static int mm_iommu_move_page_from_cma(struct page *page)
 	LIST_HEAD(cma_migrate_pages);
 
 	/* Ignore huge pages for now */
-	if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+	if (PageCompound(page))
 		return -EBUSY;
 
 	lru_add_drain();
@@ -314,6 +314,25 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(mm_iommu_lookup);
 
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
+		unsigned long ua, unsigned long size)
+{
+	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+	list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua <= ua) &&
+				(ua + size <= mem->ua +
+				 (mem->entries << PAGE_SHIFT))) {
+			ret = mem;
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm);
+
 struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries)
 {
@@ -345,6 +364,26 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 }
 EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
 
+long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa)
+{
+	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+	void *va = &mem->hpas[entry];
+	unsigned long *pa;
+
+	if (entry >= mem->entries)
+		return -EFAULT;
+
+	pa = (void *) vmalloc_to_phys(va);
+	if (!pa)
+		return -EFAULT;
+
+	*hpa = *pa | (ua & ~PAGE_MASK);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm);
+
 long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
 {
 	if (atomic64_inc_not_zero(&mem->mapped))
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index c491f2c8f2b9..4554d6527682 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -333,11 +333,6 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
 
 	mm->context.id = MMU_NO_CONTEXT;
 	mm->context.active = 0;
-
-#ifdef CONFIG_PPC_MM_SLICES
-	slice_set_user_psize(mm, mmu_virtual_psize);
-#endif
-
 	return 0;
 }
 
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9befaee237d6..371792e4418f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -875,13 +875,6 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
 	void *nd;
 	int tnid;
 
-	if (spanned_pages)
-		pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
-			nid, start_pfn << PAGE_SHIFT,
-			(end_pfn << PAGE_SHIFT) - 1);
-	else
-		pr_info("Initmem setup node %d\n", nid);
-
 	nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
 	nd = __va(nd_pa);
 
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5e01b2ece1d0..654a0d7ba0e7 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -131,7 +131,7 @@ static void __slb_flush_and_rebolt(void)
 		     "slbmte	%2,%3\n"
 		     "isync"
 		     :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)),
-		        "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)),
+		        "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)),
 		        "r"(ksp_vsid_data),
 		        "r"(ksp_esid_data)
 		     : "memory");
@@ -229,7 +229,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 		asm volatile("slbie %0" : : "r" (slbie_data));
 
 	get_paca()->slb_cache_ptr = 0;
-	copy_mm_to_paca(&mm->context);
+	copy_mm_to_paca(mm);
 
 	/*
 	 * preload some userspace segments into the SLB.
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index a85e06ea6c20..1519617aab36 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -23,6 +23,48 @@
 #include <asm/pgtable.h>
 #include <asm/firmware.h>
 
+/*
+ * This macro generates asm code to compute the VSID scramble
+ * function.  Used in slb_allocate() and do_stab_bolted.  The function
+ * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
+ *
+ *	rt = register containing the proto-VSID and into which the
+ *		VSID will be stored
+ *	rx = scratch register (clobbered)
+ *	rf = flags
+ *
+ *	- rt and rx must be different registers
+ *	- The answer will end up in the low VSID_BITS bits of rt.  The higher
+ *	  bits may contain other garbage, so you may need to mask the
+ *	  result.
+ */
+#define ASM_VSID_SCRAMBLE(rt, rx, rf, size)				\
+	lis	rx,VSID_MULTIPLIER_##size@h;				\
+	ori	rx,rx,VSID_MULTIPLIER_##size@l;				\
+	mulld	rt,rt,rx;		/* rt = rt * MULTIPLIER */	\
+/*									\
+ * powermac get slb fault before feature fixup, so make 65 bit part     \
+ * the default part of feature fixup					\
+ */									\
+BEGIN_MMU_FTR_SECTION							\
+	srdi	rx,rt,VSID_BITS_65_##size;				\
+	clrldi	rt,rt,(64-VSID_BITS_65_##size);				\
+	add	rt,rt,rx;						\
+	addi	rx,rt,1;						\
+	srdi	rx,rx,VSID_BITS_65_##size;				\
+	add	rt,rt,rx;						\
+	rldimi	rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \
+MMU_FTR_SECTION_ELSE							\
+	srdi	rx,rt,VSID_BITS_##size;					\
+	clrldi	rt,rt,(64-VSID_BITS_##size);				\
+	add	rt,rt,rx;		/* add high and low bits */	\
+	addi	rx,rt,1;						\
+	srdi	rx,rx,VSID_BITS_##size;	/* extract 2^VSID_BITS bit */	\
+	add	rt,rt,rx;						\
+	rldimi	rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA)
+
+
 /* void slb_allocate_realmode(unsigned long ea);
  *
  * Create an SLB entry for the given EA (user or kernel).
@@ -45,13 +87,6 @@ _GLOBAL(slb_allocate_realmode)
 	/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
 	blt	cr7,0f			/* user or kernel? */
 
-	/* kernel address: proto-VSID = ESID */
-	/* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
-	 * this code will generate the protoVSID 0xfffffffff for the
-	 * top segment.  That's ok, the scramble below will translate
-	 * it to VSID 0, which is reserved as a bad VSID - one which
-	 * will never have any pages in it.  */
-
 	/* Check if hitting the linear mapping or some other kernel space
 	*/
 	bne	cr7,1f
@@ -63,12 +98,10 @@ _GLOBAL(slb_allocate_realmode)
 slb_miss_kernel_load_linear:
 	li	r11,0
 	/*
-	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+	 * context = (ea >> 60) - (0xc - 1)
 	 * r9 = region id.
 	 */
-	addis	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
-	addi	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
-
+	subi	r9,r9,KERNEL_REGION_CONTEXT_OFFSET
 
 BEGIN_FTR_SECTION
 	b	.Lslb_finish_load
@@ -77,9 +110,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
 
 1:
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-	/* Check virtual memmap region. To be patches at kernel boot */
 	cmpldi	cr0,r9,0xf
 	bne	1f
+/* Check virtual memmap region. To be patched at kernel boot */
 .globl slb_miss_kernel_load_vmemmap
 slb_miss_kernel_load_vmemmap:
 	li	r11,0
@@ -102,11 +135,10 @@ slb_miss_kernel_load_io:
 	li	r11,0
 6:
 	/*
-	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+	 * context = (ea >> 60) - (0xc - 1)
 	 * r9 = region id.
 	 */
-	addis	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
-	addi	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+	subi	r9,r9,KERNEL_REGION_CONTEXT_OFFSET
 
 BEGIN_FTR_SECTION
 	b	.Lslb_finish_load
@@ -117,7 +149,13 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
 	 * For userspace addresses, make sure this is region 0.
 	 */
 	cmpdi	r9, 0
-	bne	8f
+	bne-	8f
+        /*
+         * user space make sure we are within the allowed limit
+	 */
+	ld	r11,PACA_ADDR_LIMIT(r13)
+	cmpld	r3,r11
+	bge-	8f
 
 	/* when using slices, we extract the psize off the slice bitmaps
 	 * and then we need to get the sllp encoding off the mmu_psize_defs
@@ -189,13 +227,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
  */
 .Lslb_finish_load:
 	rldimi  r10,r9,ESID_BITS,0
-	ASM_VSID_SCRAMBLE(r10,r9,256M)
-	/*
-	 * bits above VSID_BITS_256M need to be ignored from r10
-	 * also combine VSID and flags
-	 */
-	rldimi	r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
-
+	ASM_VSID_SCRAMBLE(r10,r9,r11,256M)
 	/* r3 = EA, r11 = VSID data */
 	/*
 	 * Find a slot, round robin. Previously we tried to find a
@@ -259,12 +291,12 @@ slb_compare_rr_to_size:
 .Lslb_finish_load_1T:
 	srdi	r10,r10,(SID_SHIFT_1T - SID_SHIFT)	/* get 1T ESID */
 	rldimi  r10,r9,ESID_BITS_1T,0
-	ASM_VSID_SCRAMBLE(r10,r9,1T)
+	ASM_VSID_SCRAMBLE(r10,r9,r11,1T)
 	/*
 	 * bits above VSID_BITS_1T need to be ignored from r10
 	 * also combine VSID and flags
 	 */
-	rldimi	r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T))
+
 	li	r10,MMU_SEGSIZE_1T
 	rldimi	r11,r10,SLB_VSID_SSIZE_SHIFT,0	/* insert segment size */
 
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 2b27458902ee..966b9fccfa66 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -36,38 +36,29 @@
 #include <asm/copro.h>
 #include <asm/hugetlb.h>
 
-/* some sanity checks */
-#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
-#error H_PGTABLE_RANGE exceeds slice_mask high_slices size
-#endif
-
 static DEFINE_SPINLOCK(slice_convert_lock);
-
+/*
+ * One bit per slice. We have lower slices which cover 256MB segments
+ * upto 4G range. That gets us 16 low slices. For the rest we track slices
+ * in 1TB size.
+ */
+struct slice_mask {
+	u64 low_slices;
+	DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
+};
 
 #ifdef DEBUG
 int _slice_debug = 1;
 
 static void slice_print_mask(const char *label, struct slice_mask mask)
 {
-	char	*p, buf[16 + 3 + 64 + 1];
-	int	i;
-
 	if (!_slice_debug)
 		return;
-	p = buf;
-	for (i = 0; i < SLICE_NUM_LOW; i++)
-		*(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
-	*(p++) = ' ';
-	*(p++) = '-';
-	*(p++) = ' ';
-	for (i = 0; i < SLICE_NUM_HIGH; i++)
-		*(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
-	*(p++) = 0;
-
-	printk(KERN_DEBUG "%s:%s\n", label, buf);
+	pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices);
+	pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices);
 }
 
-#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0)
+#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0)
 
 #else
 
@@ -76,25 +67,28 @@ static void slice_print_mask(const char *label, struct slice_mask mask) {}
 
 #endif
 
-static struct slice_mask slice_range_to_mask(unsigned long start,
-					     unsigned long len)
+static void slice_range_to_mask(unsigned long start, unsigned long len,
+				struct slice_mask *ret)
 {
 	unsigned long end = start + len - 1;
-	struct slice_mask ret = { 0, 0 };
+
+	ret->low_slices = 0;
+	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
 	if (start < SLICE_LOW_TOP) {
-		unsigned long mend = min(end, SLICE_LOW_TOP);
-		unsigned long mstart = min(start, SLICE_LOW_TOP);
+		unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
 
-		ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
-			- (1u << GET_LOW_SLICE_INDEX(mstart));
+		ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+			- (1u << GET_LOW_SLICE_INDEX(start));
 	}
 
-	if ((start + len) > SLICE_LOW_TOP)
-		ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
-			- (1ul << GET_HIGH_SLICE_INDEX(start));
+	if ((start + len) > SLICE_LOW_TOP) {
+		unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+		unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
 
-	return ret;
+		bitmap_set(ret->high_slices, start_index, count);
+	}
 }
 
 static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
@@ -128,53 +122,60 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
 	return !slice_area_is_free(mm, start, end - start);
 }
 
-static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
+static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
 {
-	struct slice_mask ret = { 0, 0 };
 	unsigned long i;
 
+	ret->low_slices = 0;
+	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (!slice_low_has_vma(mm, i))
-			ret.low_slices |= 1u << i;
+			ret->low_slices |= 1u << i;
 
 	if (mm->task_size <= SLICE_LOW_TOP)
-		return ret;
+		return;
 
-	for (i = 0; i < SLICE_NUM_HIGH; i++)
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
 		if (!slice_high_has_vma(mm, i))
-			ret.high_slices |= 1ul << i;
-
-	return ret;
+			__set_bit(i, ret->high_slices);
 }
 
-static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
+static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret)
 {
 	unsigned char *hpsizes;
 	int index, mask_index;
-	struct slice_mask ret = { 0, 0 };
 	unsigned long i;
 	u64 lpsizes;
 
+	ret->low_slices = 0;
+	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+
 	lpsizes = mm->context.low_slices_psize;
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (((lpsizes >> (i * 4)) & 0xf) == psize)
-			ret.low_slices |= 1u << i;
+			ret->low_slices |= 1u << i;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
-			ret.high_slices |= 1ul << i;
+			__set_bit(i, ret->high_slices);
 	}
-
-	return ret;
 }
 
-static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
+static int slice_check_fit(struct mm_struct *mm,
+			   struct slice_mask mask, struct slice_mask available)
 {
+	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
+
+	bitmap_and(result, mask.high_slices,
+		   available.high_slices, slice_count);
+
 	return (mask.low_slices & available.low_slices) == mask.low_slices &&
-		(mask.high_slices & available.high_slices) == mask.high_slices;
+		bitmap_equal(result, mask.high_slices, slice_count);
 }
 
 static void slice_flush_segments(void *parm)
@@ -185,7 +186,7 @@ static void slice_flush_segments(void *parm)
 	if (mm != current->active_mm)
 		return;
 
-	copy_mm_to_paca(&current->active_mm->context);
+	copy_mm_to_paca(current->active_mm);
 
 	local_irq_save(flags);
 	slb_flush_and_rebolt();
@@ -218,18 +219,18 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 	mm->context.low_slices_psize = lpsizes;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
-		if (mask.high_slices & (1ul << i))
+		if (test_bit(i, mask.high_slices))
 			hpsizes[index] = (hpsizes[index] &
 					  ~(0xf << (mask_index * 4))) |
 				(((unsigned long)psize) << (mask_index * 4));
 	}
 
 	slice_dbg(" lsps=%lx, hsps=%lx\n",
-		  mm->context.low_slices_psize,
-		  mm->context.high_slices_psize);
+		  (unsigned long)mm->context.low_slices_psize,
+		  (unsigned long)mm->context.high_slices_psize);
 
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 
@@ -257,14 +258,14 @@ static bool slice_scan_available(unsigned long addr,
 		slice = GET_HIGH_SLICE_INDEX(addr);
 		*boundary_addr = (slice + end) ?
 			((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
-		return !!(available.high_slices & (1ul << slice));
+		return !!test_bit(slice, available.high_slices);
 	}
 }
 
 static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 					      unsigned long len,
 					      struct slice_mask available,
-					      int psize)
+					      int psize, unsigned long high_limit)
 {
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	unsigned long addr, found, next_end;
@@ -276,7 +277,10 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 	info.align_offset = 0;
 
 	addr = TASK_UNMAPPED_BASE;
-	while (addr < TASK_SIZE) {
+	/*
+	 * Check till the allow max value for this mmap request
+	 */
+	while (addr < high_limit) {
 		info.low_limit = addr;
 		if (!slice_scan_available(addr, available, 1, &addr))
 			continue;
@@ -288,8 +292,8 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 		 * Check if we need to reduce the range, or if we can
 		 * extend it to cover the next available slice.
 		 */
-		if (addr >= TASK_SIZE)
-			addr = TASK_SIZE;
+		if (addr >= high_limit)
+			addr = high_limit;
 		else if (slice_scan_available(addr, available, 1, &next_end)) {
 			addr = next_end;
 			goto next_slice;
@@ -307,7 +311,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 					     unsigned long len,
 					     struct slice_mask available,
-					     int psize)
+					     int psize, unsigned long high_limit)
 {
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	unsigned long addr, found, prev;
@@ -319,6 +323,15 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 	info.align_offset = 0;
 
 	addr = mm->mmap_base;
+	/*
+	 * If we are trying to allocate above DEFAULT_MAP_WINDOW
+	 * Add the different to the mmap_base.
+	 * Only for that request for which high_limit is above
+	 * DEFAULT_MAP_WINDOW we should apply this.
+	 */
+	if (high_limit  > DEFAULT_MAP_WINDOW)
+		addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
 	while (addr > PAGE_SIZE) {
 		info.high_limit = addr;
 		if (!slice_scan_available(addr - 1, available, 0, &addr))
@@ -350,29 +363,38 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	return slice_find_area_bottomup(mm, len, available, psize);
+	return slice_find_area_bottomup(mm, len, available, psize, high_limit);
 }
 
 
 static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
 				     struct slice_mask mask, int psize,
-				     int topdown)
+				     int topdown, unsigned long high_limit)
 {
 	if (topdown)
-		return slice_find_area_topdown(mm, len, mask, psize);
+		return slice_find_area_topdown(mm, len, mask, psize, high_limit);
 	else
-		return slice_find_area_bottomup(mm, len, mask, psize);
+		return slice_find_area_bottomup(mm, len, mask, psize, high_limit);
 }
 
-#define or_mask(dst, src)	do {			\
-	(dst).low_slices |= (src).low_slices;		\
-	(dst).high_slices |= (src).high_slices;		\
-} while (0)
+static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src)
+{
+	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+
+	dst->low_slices |= src->low_slices;
+	bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+	bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+}
 
-#define andnot_mask(dst, src)	do {			\
-	(dst).low_slices &= ~(src).low_slices;		\
-	(dst).high_slices &= ~(src).high_slices;	\
-} while (0)
+static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src)
+{
+	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+
+	dst->low_slices &= ~src->low_slices;
+
+	bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+	bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+}
 
 #ifdef CONFIG_PPC_64K_PAGES
 #define MMU_PAGE_BASE	MMU_PAGE_64K
@@ -384,14 +406,43 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 				      unsigned long flags, unsigned int psize,
 				      int topdown)
 {
-	struct slice_mask mask = {0, 0};
+	struct slice_mask mask;
 	struct slice_mask good_mask;
-	struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
-	struct slice_mask compat_mask = {0, 0};
+	struct slice_mask potential_mask;
+	struct slice_mask compat_mask;
 	int fixed = (flags & MAP_FIXED);
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	struct mm_struct *mm = current->mm;
 	unsigned long newaddr;
+	unsigned long high_limit;
+
+	/*
+	 * Check if we need to expland slice area.
+	 */
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE)) {
+		mm->context.addr_limit = TASK_SIZE;
+		on_each_cpu(slice_flush_segments, mm, 1);
+	}
+	/*
+	 * This mmap request can allocate upt to 512TB
+	 */
+	if (addr > DEFAULT_MAP_WINDOW)
+		high_limit = mm->context.addr_limit;
+	else
+		high_limit = DEFAULT_MAP_WINDOW;
+	/*
+	 * init different masks
+	 */
+	mask.low_slices = 0;
+	bitmap_zero(mask.high_slices, SLICE_NUM_HIGH);
+
+	/* silence stupid warning */;
+	potential_mask.low_slices = 0;
+	bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH);
+
+	compat_mask.low_slices = 0;
+	bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH);
 
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
@@ -423,7 +474,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* First make up a "good" mask of slices that have the right size
 	 * already
 	 */
-	good_mask = slice_mask_for_size(mm, psize);
+	slice_mask_for_size(mm, psize, &good_mask);
 	slice_print_mask(" good_mask", good_mask);
 
 	/*
@@ -448,22 +499,22 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If we support combo pages, we can allow 64k pages in 4k slices */
 	if (psize == MMU_PAGE_64K) {
-		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
 		if (fixed)
-			or_mask(good_mask, compat_mask);
+			slice_or_mask(&good_mask, &compat_mask);
 	}
 #endif
 
 	/* First check hint if it's valid or if we have MAP_FIXED */
 	if (addr != 0 || fixed) {
 		/* Build a mask for the requested range */
-		mask = slice_range_to_mask(addr, len);
+		slice_range_to_mask(addr, len, &mask);
 		slice_print_mask(" mask", mask);
 
 		/* Check if we fit in the good mask. If we do, we just return,
 		 * nothing else to do
 		 */
-		if (slice_check_fit(mask, good_mask)) {
+		if (slice_check_fit(mm, mask, good_mask)) {
 			slice_dbg(" fits good !\n");
 			return addr;
 		}
@@ -471,7 +522,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 		/* Now let's see if we can find something in the existing
 		 * slices for that size
 		 */
-		newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
+		newaddr = slice_find_area(mm, len, good_mask,
+					  psize, topdown, high_limit);
 		if (newaddr != -ENOMEM) {
 			/* Found within the good mask, we don't have to setup,
 			 * we thus return directly
@@ -484,11 +536,11 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* We don't fit in the good mask, check what other slices are
 	 * empty and thus can be converted
 	 */
-	potential_mask = slice_mask_for_free(mm);
-	or_mask(potential_mask, good_mask);
+	slice_mask_for_free(mm, &potential_mask);
+	slice_or_mask(&potential_mask, &good_mask);
 	slice_print_mask(" potential", potential_mask);
 
-	if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
+	if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) {
 		slice_dbg(" fits potential !\n");
 		goto convert;
 	}
@@ -503,7 +555,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	 * anywhere in the good area.
 	 */
 	if (addr) {
-		addr = slice_find_area(mm, len, good_mask, psize, topdown);
+		addr = slice_find_area(mm, len, good_mask,
+				       psize, topdown, high_limit);
 		if (addr != -ENOMEM) {
 			slice_dbg(" found area at 0x%lx\n", addr);
 			return addr;
@@ -513,28 +566,29 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/* Now let's see if we can find something in the existing slices
 	 * for that size plus free slices
 	 */
-	addr = slice_find_area(mm, len, potential_mask, psize, topdown);
+	addr = slice_find_area(mm, len, potential_mask,
+			       psize, topdown, high_limit);
 
 #ifdef CONFIG_PPC_64K_PAGES
 	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
 		/* retry the search with 4k-page slices included */
-		or_mask(potential_mask, compat_mask);
-		addr = slice_find_area(mm, len, potential_mask, psize,
-				       topdown);
+		slice_or_mask(&potential_mask, &compat_mask);
+		addr = slice_find_area(mm, len, potential_mask,
+				       psize, topdown, high_limit);
 	}
 #endif
 
 	if (addr == -ENOMEM)
 		return -ENOMEM;
 
-	mask = slice_range_to_mask(addr, len);
+	slice_range_to_mask(addr, len, &mask);
 	slice_dbg(" found potential area at 0x%lx\n", addr);
 	slice_print_mask(" mask", mask);
 
  convert:
-	andnot_mask(mask, good_mask);
-	andnot_mask(mask, compat_mask);
-	if (mask.low_slices || mask.high_slices) {
+	slice_andnot_mask(&mask, &good_mask);
+	slice_andnot_mask(&mask, &compat_mask);
+	if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) {
 		slice_convert(mm, mask, psize);
 		if (psize > MMU_PAGE_BASE)
 			on_each_cpu(slice_flush_segments, mm, 1);
@@ -649,8 +703,8 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 
 
 	slice_dbg(" lsps=%lx, hsps=%lx\n",
-		  mm->context.low_slices_psize,
-		  mm->context.high_slices_psize);
+		  (unsigned long)mm->context.low_slices_psize,
+		  (unsigned long)mm->context.high_slices_psize);
 
  bail:
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
@@ -659,9 +713,11 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
 			   unsigned long len, unsigned int psize)
 {
-	struct slice_mask mask = slice_range_to_mask(start, len);
+	struct slice_mask mask;
 
 	VM_BUG_ON(radix_enabled());
+
+	slice_range_to_mask(start, len, &mask);
 	slice_convert(mm, mask, psize);
 }
 
@@ -694,14 +750,14 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 	if (radix_enabled())
 		return 0;
 
-	mask = slice_range_to_mask(addr, len);
-	available = slice_mask_for_size(mm, psize);
+	slice_range_to_mask(addr, len, &mask);
+	slice_mask_for_size(mm, psize, &available);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* We need to account for 4k slices too */
 	if (psize == MMU_PAGE_64K) {
 		struct slice_mask compat_mask;
-		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
-		or_mask(available, compat_mask);
+		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
+		slice_or_mask(&available, &compat_mask);
 	}
 #endif
 
@@ -711,6 +767,6 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 	slice_print_mask(" mask", mask);
 	slice_print_mask(" available", available);
 #endif
-	return !slice_check_fit(mask, available);
+	return !slice_check_fit(mm, mask, available);
 }
 #endif
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 94210940112f..e94fbd4c8845 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -197,7 +197,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
 
 	/* Check parameters */
 	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
-	    addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE)
+	    addr >= mm->task_size || len >= mm->task_size ||
+	    addr + len > mm->task_size)
 		return -EINVAL;
 
 	if (is_hugepage_only_range(mm, addr, len))
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 952713d6cf04..02e71402fdd3 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -17,7 +17,6 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
 #define RIC_FLUSH_TLB 0
 #define RIC_FLUSH_PWC 1
@@ -34,10 +33,8 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
 	prs = 1; /* process scoped */
 	r = 1;   /* raidx format */
 
-	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	asm volatile("ptesync": : :"memory");
 }
 
 /*
@@ -47,9 +44,33 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 {
 	int set;
 
-	for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+	 * also flush the entire Page Walk Cache.
+	 */
+	__tlbiel_pid(pid, 0, ric);
+
+	if (ric == RIC_FLUSH_ALL)
+		/* For the remaining sets, just flush the TLB */
+		ric = RIC_FLUSH_TLB;
+
+	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
 		__tlbiel_pid(pid, set, ric);
-	}
+
+	asm volatile("ptesync": : :"memory");
+	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void tlbiel_pwc(unsigned long pid)
+{
+	asm volatile("ptesync": : :"memory");
+
+	/* For PWC flush, we don't look at set number */
+	__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+
+	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
 }
 
@@ -129,12 +150,18 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 {
 	unsigned long pid;
 	struct mm_struct *mm = tlb->mm;
+	/*
+	 * If we are doing a full mm flush, we will do a tlb flush
+	 * with RIC_FLUSH_ALL later.
+	 */
+	if (tlb->fullmm)
+		return;
 
 	preempt_disable();
 
 	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
+		tlbiel_pwc(pid);
 
 	preempt_enable();
 }
@@ -175,15 +202,9 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
 
-	if (!mm_is_thread_local(mm)) {
-		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			raw_spin_lock(&native_tlbie_lock);
+	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
-		if (lock_tlbie)
-			raw_spin_unlock(&native_tlbie_lock);
-	} else
+	else
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
 no_context:
 	preempt_enable();
@@ -195,22 +216,22 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 	unsigned long pid;
 	struct mm_struct *mm = tlb->mm;
 
+	/*
+	 * If we are doing a full mm flush, we will do a tlb flush
+	 * with RIC_FLUSH_ALL later.
+	 */
+	if (tlb->fullmm)
+		return;
 	preempt_disable();
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
 
-	if (!mm_is_thread_local(mm)) {
-		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			raw_spin_lock(&native_tlbie_lock);
+	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_PWC);
-		if (lock_tlbie)
-			raw_spin_unlock(&native_tlbie_lock);
-	} else
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
+	else
+		tlbiel_pwc(pid);
 no_context:
 	preempt_enable();
 }
@@ -226,15 +247,9 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 	pid = mm ? mm->context.id : 0;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto bail;
-	if (!mm_is_thread_local(mm)) {
-		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			raw_spin_lock(&native_tlbie_lock);
+	if (!mm_is_thread_local(mm))
 		_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-		if (lock_tlbie)
-			raw_spin_unlock(&native_tlbie_lock);
-	} else
+	else
 		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
 bail:
 	preempt_enable();
@@ -255,13 +270,7 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
 
 void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-	if (lock_tlbie)
-		raw_spin_lock(&native_tlbie_lock);
 	_tlbie_pid(0, RIC_FLUSH_ALL);
-	if (lock_tlbie)
-		raw_spin_unlock(&native_tlbie_lock);
 }
 EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 
@@ -323,7 +332,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 	unsigned long addr;
 	int local = mm_is_thread_local(mm);
 	unsigned long ap = mmu_get_ap(psize);
-	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
 
@@ -344,13 +352,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 
 		if (local)
 			_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-		else {
-			if (lock_tlbie)
-				raw_spin_lock(&native_tlbie_lock);
+		else
 			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-			if (lock_tlbie)
-				raw_spin_unlock(&native_tlbie_lock);
-		}
 	}
 err_out:
 	preempt_enable();
@@ -437,7 +440,7 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
 		return;
 	}
 
-	if (old_pte & _PAGE_LARGE)
+	if (old_pte & R_PAGE_LARGE)
 		radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M);
 	else
 		radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index ba28fcb98597..bfc4a0869609 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -770,7 +770,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 	 * avoid going over total available memory just in case...
 	 */
 #ifdef CONFIG_PPC_FSL_BOOK3E
-	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+	if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
 		unsigned long linear_sz;
 		unsigned int num_cams;
 
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 2ff13249f87a..6c2d4168daec 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2049,6 +2049,14 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 			data.br_stack = &cpuhw->bhrb_stack;
 		}
 
+		if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
+						ppmu->get_mem_data_src)
+			ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
+
+		if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
+						ppmu->get_mem_weight)
+			ppmu->get_mem_weight(&data.weight);
+
 		if (perf_event_overflow(event, &data, regs))
 			power_pmu_stop(event, 0);
 	}
diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index cd951fd231c4..8125160be7bc 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -148,6 +148,88 @@ static bool is_thresh_cmp_valid(u64 event)
 	return true;
 }
 
+static inline u64 isa207_find_source(u64 idx, u32 sub_idx)
+{
+	u64 ret = PERF_MEM_NA;
+
+	switch(idx) {
+	case 0:
+		/* Nothing to do */
+		break;
+	case 1:
+		ret = PH(LVL, L1);
+		break;
+	case 2:
+		ret = PH(LVL, L2);
+		break;
+	case 3:
+		ret = PH(LVL, L3);
+		break;
+	case 4:
+		if (sub_idx <= 1)
+			ret = PH(LVL, LOC_RAM);
+		else if (sub_idx > 1 && sub_idx <= 2)
+			ret = PH(LVL, REM_RAM1);
+		else
+			ret = PH(LVL, REM_RAM2);
+		ret |= P(SNOOP, HIT);
+		break;
+	case 5:
+		ret = PH(LVL, REM_CCE1);
+		if ((sub_idx == 0) || (sub_idx == 2) || (sub_idx == 4))
+			ret |= P(SNOOP, HIT);
+		else if ((sub_idx == 1) || (sub_idx == 3) || (sub_idx == 5))
+			ret |= P(SNOOP, HITM);
+		break;
+	case 6:
+		ret = PH(LVL, REM_CCE2);
+		if ((sub_idx == 0) || (sub_idx == 2))
+			ret |= P(SNOOP, HIT);
+		else if ((sub_idx == 1) || (sub_idx == 3))
+			ret |= P(SNOOP, HITM);
+		break;
+	case 7:
+		ret = PM(LVL, L1);
+		break;
+	}
+
+	return ret;
+}
+
+void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
+							struct pt_regs *regs)
+{
+	u64 idx;
+	u32 sub_idx;
+	u64 sier;
+	u64 val;
+
+	/* Skip if no SIER support */
+	if (!(flags & PPMU_HAS_SIER)) {
+		dsrc->val = 0;
+		return;
+	}
+
+	sier = mfspr(SPRN_SIER);
+	val = (sier & ISA207_SIER_TYPE_MASK) >> ISA207_SIER_TYPE_SHIFT;
+	if (val == 1 || val == 2) {
+		idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT;
+		sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> ISA207_SIER_DATA_SRC_SHIFT;
+
+		dsrc->val = isa207_find_source(idx, sub_idx);
+		dsrc->val |= (val == 1) ? P(OP, LOAD) : P(OP, STORE);
+	}
+}
+
+void isa207_get_mem_weight(u64 *weight)
+{
+	u64 mmcra = mfspr(SPRN_MMCRA);
+	u64 exp = MMCRA_THR_CTR_EXP(mmcra);
+	u64 mantissa = MMCRA_THR_CTR_MANT(mmcra);
+
+	*weight = mantissa << (2 * exp);
+}
+
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 {
 	unsigned int unit, pmc, cache, ebb;
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 899210f14ee4..8acbe6e802c7 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -248,6 +248,15 @@
 #define MMCRA_SDAR_MODE_TLB		(1ull << MMCRA_SDAR_MODE_SHIFT)
 #define MMCRA_SDAR_MODE_NO_UPDATES	~(0x3ull << MMCRA_SDAR_MODE_SHIFT)
 #define MMCRA_IFM_SHIFT			30
+#define MMCRA_THR_CTR_MANT_SHIFT	19
+#define MMCRA_THR_CTR_MANT_MASK		0x7Ful
+#define MMCRA_THR_CTR_MANT(v)		(((v) >> MMCRA_THR_CTR_MANT_SHIFT) &\
+						MMCRA_THR_CTR_MANT_MASK)
+
+#define MMCRA_THR_CTR_EXP_SHIFT		27
+#define MMCRA_THR_CTR_EXP_MASK		0x7ul
+#define MMCRA_THR_CTR_EXP(v)		(((v) >> MMCRA_THR_CTR_EXP_SHIFT) &\
+						MMCRA_THR_CTR_EXP_MASK)
 
 /* MMCR1 Threshold Compare bit constant for power9 */
 #define p9_MMCRA_THR_CMP_SHIFT	45
@@ -260,6 +269,19 @@
 #define MAX_ALT				2
 #define MAX_PMU_COUNTERS		6
 
+#define ISA207_SIER_TYPE_SHIFT		15
+#define ISA207_SIER_TYPE_MASK		(0x7ull << ISA207_SIER_TYPE_SHIFT)
+
+#define ISA207_SIER_LDST_SHIFT		1
+#define ISA207_SIER_LDST_MASK		(0x7ull << ISA207_SIER_LDST_SHIFT)
+
+#define ISA207_SIER_DATA_SRC_SHIFT	53
+#define ISA207_SIER_DATA_SRC_MASK	(0x7ull << ISA207_SIER_DATA_SRC_SHIFT)
+
+#define P(a, b)				PERF_MEM_S(a, b)
+#define PH(a, b)			(P(LVL, HIT) | P(a, b))
+#define PM(a, b)			(P(LVL, MISS) | P(a, b))
+
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp);
 int isa207_compute_mmcr(u64 event[], int n_ev,
 				unsigned int hwc[], unsigned long mmcr[],
@@ -267,6 +289,8 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
 void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]);
 int isa207_get_alternatives(u64 event, u64 alt[],
 				const unsigned int ev_alt[][MAX_ALT], int size);
-
+void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
+							struct pt_regs *regs);
+void isa207_get_mem_weight(u64 *weight);
 
 #endif
diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h
index 3a2e6e8ebb92..0f1d184627cc 100644
--- a/arch/powerpc/perf/power8-events-list.h
+++ b/arch/powerpc/perf/power8-events-list.h
@@ -89,3 +89,9 @@ EVENT(PM_MRK_FILT_MATCH,			0x2013c)
 EVENT(PM_MRK_FILT_MATCH_ALT,			0x3012e)
 /* Alternate event code for PM_LD_MISS_L1 */
 EVENT(PM_LD_MISS_L1_ALT,			0x400f0)
+/*
+ * Memory Access Event -- mem_access
+ * Primary PMU event used here is PM_MRK_INST_CMPL, along with
+ * Random Load/Store Facility Sampling (RIS) in Random sampling mode (MMCRA[SM]).
+ */
+EVENT(MEM_ACCESS,				0x10401e0)
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index ce15b19a7962..5463516e369b 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -90,6 +90,7 @@ GENERIC_EVENT_ATTR(branch-instructions,		PM_BRU_FIN);
 GENERIC_EVENT_ATTR(branch-misses,		PM_BR_MPRED_CMPL);
 GENERIC_EVENT_ATTR(cache-references,		PM_LD_REF_L1);
 GENERIC_EVENT_ATTR(cache-misses,		PM_LD_MISS_L1);
+GENERIC_EVENT_ATTR(mem_access,			MEM_ACCESS);
 
 CACHE_EVENT_ATTR(L1-dcache-load-misses,		PM_LD_MISS_L1);
 CACHE_EVENT_ATTR(L1-dcache-loads,		PM_LD_REF_L1);
@@ -120,6 +121,7 @@ static struct attribute *power8_events_attr[] = {
 	GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
 	GENERIC_EVENT_PTR(PM_LD_REF_L1),
 	GENERIC_EVENT_PTR(PM_LD_MISS_L1),
+	GENERIC_EVENT_PTR(MEM_ACCESS),
 
 	CACHE_EVENT_PTR(PM_LD_MISS_L1),
 	CACHE_EVENT_PTR(PM_LD_REF_L1),
@@ -325,6 +327,8 @@ static struct power_pmu power8_pmu = {
 	.bhrb_filter_map	= power8_bhrb_filter_map,
 	.get_constraint		= isa207_get_constraint,
 	.get_alternatives	= power8_get_alternatives,
+	.get_mem_data_src	= isa207_get_mem_data_src,
+	.get_mem_weight		= isa207_get_mem_weight,
 	.disable_pmc		= isa207_disable_pmc,
 	.flags			= PPMU_HAS_SIER | PPMU_ARCH_207S,
 	.n_generic		= ARRAY_SIZE(power8_generic_events),
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 7f6582708e06..018f8e90ac35 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -427,6 +427,8 @@ static struct power_pmu power9_pmu = {
 	.bhrb_filter_map	= power9_bhrb_filter_map,
 	.get_constraint		= isa207_get_constraint,
 	.get_alternatives	= power9_get_alternatives,
+	.get_mem_data_src	= isa207_get_mem_data_src,
+	.get_mem_weight		= isa207_get_mem_weight,
 	.disable_pmc		= isa207_disable_pmc,
 	.flags			= PPMU_HAS_SIER | PPMU_ARCH_207S,
 	.n_generic		= ARRAY_SIZE(power9_generic_events),
diff --git a/arch/powerpc/platforms/44x/sam440ep.c b/arch/powerpc/platforms/44x/sam440ep.c
index 688ffeab0699..55fed5e4de14 100644
--- a/arch/powerpc/platforms/44x/sam440ep.c
+++ b/arch/powerpc/platforms/44x/sam440ep.c
@@ -70,7 +70,7 @@ static struct i2c_board_info sam440ep_rtc_info = {
 	.irq = -1,
 };
 
-static int sam440ep_setup_rtc(void)
+static int __init sam440ep_setup_rtc(void)
 {
 	return i2c_register_board_info(0, &sam440ep_rtc_info, 1);
 }
diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index b625a2c6f4f2..e4c745981912 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -33,7 +33,6 @@ config PPC_EFIKA
 	bool "bPlan Efika 5k2. MPC5200B based computer"
 	depends on PPC_MPC52xx
 	select PPC_RTAS
-	select RTAS_PROC
 	select PPC_NATIVE
 
 config PPC_LITE5200
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 078097a0b09d..f51fd35f4618 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -344,6 +344,7 @@ done:
 }
 
 struct smp_ops_t smp_85xx_ops = {
+	.cause_nmi_ipi = NULL,
 	.kick_cpu = smp_85xx_kick_cpu,
 	.cpu_bootable = smp_generic_cpu_bootable,
 #ifdef CONFIG_HOTPLUG_CPU
@@ -461,16 +462,9 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
 }
 #endif /* CONFIG_KEXEC_CORE */
 
-static void smp_85xx_basic_setup(int cpu_nr)
-{
-	if (cpu_has_feature(CPU_FTR_DBELL))
-		doorbell_setup_this_cpu();
-}
-
 static void smp_85xx_setup_cpu(int cpu_nr)
 {
 	mpic_setup_this_cpu();
-	smp_85xx_basic_setup(cpu_nr);
 }
 
 void __init mpc85xx_smp_init(void)
@@ -484,7 +478,7 @@ void __init mpc85xx_smp_init(void)
 		smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu;
 		smp_85xx_ops.message_pass = smp_mpic_message_pass;
 	} else
-		smp_85xx_ops.setup_cpu = smp_85xx_basic_setup;
+		smp_85xx_ops.setup_cpu = NULL;
 
 	if (cpu_has_feature(CPU_FTR_DBELL)) {
 		/*
@@ -492,7 +486,7 @@ void __init mpc85xx_smp_init(void)
 		 * smp_muxed_ipi_message_pass
 		 */
 		smp_85xx_ops.message_pass = NULL;
-		smp_85xx_ops.cause_ipi = doorbell_cause_ipi;
+		smp_85xx_ops.cause_ipi = doorbell_global_ipi;
 		smp_85xx_ops.probe = NULL;
 	}
 
diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
index af09baee22cb..020e84a47a32 100644
--- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c
+++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
@@ -105,6 +105,7 @@ smp_86xx_setup_cpu(int cpu_nr)
 
 
 struct smp_ops_t smp_86xx_ops = {
+	.cause_nmi_ipi = NULL,
 	.message_pass = smp_mpic_message_pass,
 	.probe = smp_mpic_probe,
 	.kick_cpu = smp_86xx_kick_cpu,
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 99b0ae8acb78..684e886eaae4 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -279,7 +279,8 @@ config PPC_ICSWX
 
 	  This option enables kernel support for the PowerPC Initiate
 	  Coprocessor Store Word (icswx) coprocessor instruction on POWER7
-	  or newer processors.
+	  and POWER8 processors. POWER9 uses new copy/paste instructions
+	  to invoke the coprocessor.
 
 	  This option is only useful if you have a processor that supports
 	  the icswx coprocessor instruction. It does not have any effect
@@ -359,7 +360,7 @@ config PPC_BOOK3E_MMU
 
 config PPC_MM_SLICES
 	bool
-	default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES)
+	default y if PPC_STD_MMU_64
 	default n
 
 config PPC_HAVE_PMU_SUPPORT
@@ -371,9 +372,16 @@ config PPC_PERF_CTRS
        help
          This enables the powerpc-specific perf_event back-end.
 
+config FORCE_SMP
+	# Allow platforms to force SMP=y by selecting this
+	bool
+	default n
+	select SMP
+
 config SMP
 	depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x
-	bool "Symmetric multi-processing support"
+	select GENERIC_IRQ_MIGRATION
+	bool "Symmetric multi-processing support" if !FORCE_SMP
 	---help---
 	  This enables support for systems with more than one CPU. If you have
 	  a system with only one CPU, say N. If you have a system with more
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index 8b55c5f19d4c..8d3ae2cc52bf 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -15,9 +15,9 @@
 #include <linux/msi.h>
 #include <linux/export.h>
 #include <linux/of_platform.h>
-#include <linux/debugfs.h>
 #include <linux/slab.h>
 
+#include <asm/debugfs.h>
 #include <asm/dcr.h>
 #include <asm/machdep.h>
 #include <asm/prom.h>
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index a6bbbaba14a3..871d38479a25 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -211,7 +211,7 @@ void iic_request_IPIs(void)
 	iic_request_ipi(PPC_MSG_CALL_FUNCTION);
 	iic_request_ipi(PPC_MSG_RESCHEDULE);
 	iic_request_ipi(PPC_MSG_TICK_BROADCAST);
-	iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
+	iic_request_ipi(PPC_MSG_NMI_IPI);
 }
 
 #endif /* CONFIG_SMP */
diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c
index e7d075077cb0..a88944db9fc3 100644
--- a/arch/powerpc/platforms/cell/pervasive.c
+++ b/arch/powerpc/platforms/cell/pervasive.c
@@ -88,11 +88,14 @@ static void cbe_power_save(void)
 static int cbe_system_reset_exception(struct pt_regs *regs)
 {
 	switch (regs->msr & SRR1_WAKEMASK) {
-	case SRR1_WAKEEE:
-		do_IRQ(regs);
-		break;
 	case SRR1_WAKEDEC:
-		timer_interrupt(regs);
+		set_dec(1);
+	case SRR1_WAKEEE:
+		/*
+		 * Handle these when interrupts get re-enabled and we take
+		 * them as regular exceptions. We are in an NMI context
+		 * and can't handle these here.
+		 */
 		break;
 	case SRR1_WAKEMT:
 		return cbe_sysreset_hack();
diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c
index b6c9a0dcc924..14515040f7cd 100644
--- a/arch/powerpc/platforms/chrp/smp.c
+++ b/arch/powerpc/platforms/chrp/smp.c
@@ -44,6 +44,7 @@ static void smp_chrp_setup_cpu(int cpu_nr)
 
 /* CHRP with openpic */
 struct smp_ops_t chrp_smp_ops = {
+	.cause_nmi_ipi = NULL,
 	.message_pass = smp_mpic_message_pass,
 	.probe = smp_mpic_probe,
 	.kick_cpu = smp_chrp_kick_cpu,
diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c
index 75b296bc51af..44e0d9226f0a 100644
--- a/arch/powerpc/platforms/pasemi/idle.c
+++ b/arch/powerpc/platforms/pasemi/idle.c
@@ -53,11 +53,14 @@ static int pasemi_system_reset_exception(struct pt_regs *regs)
 		regs->nip = regs->link;
 
 	switch (regs->msr & SRR1_WAKEMASK) {
-	case SRR1_WAKEEE:
-		do_IRQ(regs);
-		break;
 	case SRR1_WAKEDEC:
-		timer_interrupt(regs);
+		set_dec(1);
+	case SRR1_WAKEEE:
+		/*
+		 * Handle these when interrupts get re-enabled and we take
+		 * them as regular exceptions. We are in an NMI context
+		 * and can't handle these here.
+		 */
 		break;
 	default:
 		/* do system reset */
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index 746ca7321b03..2cd99eb30762 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -172,7 +172,7 @@ static irqreturn_t psurge_ipi_intr(int irq, void *d)
 	return IRQ_HANDLED;
 }
 
-static void smp_psurge_cause_ipi(int cpu, unsigned long data)
+static void smp_psurge_cause_ipi(int cpu)
 {
 	psurge_set_ipi(cpu);
 }
@@ -447,6 +447,7 @@ void __init smp_psurge_give_timebase(void)
 struct smp_ops_t psurge_smp_ops = {
 	.message_pass	= NULL,	/* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= smp_psurge_cause_ipi,
+	.cause_nmi_ipi	= NULL,
 	.probe		= smp_psurge_probe,
 	.kick_cpu	= smp_psurge_kick_cpu,
 	.setup_cpu	= smp_psurge_setup_cpu,
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 3a07e4dcf97c..6a6f4ef46b9e 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -4,6 +4,7 @@ config PPC_POWERNV
 	select PPC_NATIVE
 	select PPC_XICS
 	select PPC_ICP_NATIVE
+	select PPC_XIVE_NATIVE
 	select PPC_P7_NAP
 	select PCI
 	select PCI_MSI
@@ -19,6 +20,8 @@ config PPC_POWERNV
 	select CPU_FREQ_GOV_ONDEMAND
 	select CPU_FREQ_GOV_CONSERVATIVE
 	select PPC_DOORBELL
+	select MMU_NOTIFIER
+	select FORCE_SMP
 	default y
 
 config OPAL_PRD
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6fb5522acd70..d2f19821d71d 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1102,6 +1102,13 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
 		return -EIO;
 	}
 
+	/*
+	 * If dealing with the root bus (or the bus underneath the
+	 * root port), we reset the bus underneath the root port.
+	 *
+	 * The cxl driver depends on this behaviour for bi-modal card
+	 * switching.
+	 */
 	if (pci_is_root_bus(bus) ||
 	    pci_is_root_bus(bus->parent))
 		return pnv_eeh_root_reset(hose, option);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 4ee837e6391a..445f30a2c5ef 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -53,19 +53,6 @@ static int pnv_save_sprs_for_deep_states(void)
 		uint64_t pir = get_hard_smp_processor_id(cpu);
 		uint64_t hsprg0_val = (uint64_t)&paca[cpu];
 
-		if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
-			/*
-			 * HSPRG0 is used to store the cpu's pointer to paca.
-			 * Hence last 3 bits are guaranteed to be 0. Program
-			 * slw to restore HSPRG0 with 63rd bit set, so that
-			 * when a thread wakes up at 0x100 we can use this bit
-			 * to distinguish between fastsleep and deep winkle.
-			 * This is not necessary with stop/psscr since PLS
-			 * field of psscr indicates which state we are waking
-			 * up from.
-			 */
-			hsprg0_val |= 1;
-		}
 		rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
 		if (rc != 0)
 			return rc;
@@ -122,9 +109,12 @@ static void pnv_alloc_idle_core_states(void)
 	for (i = 0; i < nr_cores; i++) {
 		int first_cpu = i * threads_per_core;
 		int node = cpu_to_node(first_cpu);
+		size_t paca_ptr_array_size;
 
 		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
 		*core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
+		paca_ptr_array_size = (threads_per_core *
+				       sizeof(struct paca_struct *));
 
 		for (j = 0; j < threads_per_core; j++) {
 			int cpu = first_cpu + j;
@@ -132,6 +122,11 @@ static void pnv_alloc_idle_core_states(void)
 			paca[cpu].core_idle_state_ptr = core_idle_state;
 			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
 			paca[cpu].thread_mask = 1 << j;
+			if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
+				continue;
+			paca[cpu].thread_sibling_pacas =
+				kmalloc_node(paca_ptr_array_size,
+					     GFP_KERNEL, node);
 		}
 	}
 
@@ -147,7 +142,6 @@ u32 pnv_get_supported_cpuidle_states(void)
 }
 EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
 
-
 static void pnv_fastsleep_workaround_apply(void *info)
 
 {
@@ -241,8 +235,9 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
  * The default stop state that will be used by ppc_md.power_save
  * function on platforms that support stop instruction.
  */
-u64 pnv_default_stop_val;
-u64 pnv_default_stop_mask;
+static u64 pnv_default_stop_val;
+static u64 pnv_default_stop_mask;
+static bool default_stop_found;
 
 /*
  * Used for ppc_md.power_save which needs a function with no parameters
@@ -262,8 +257,42 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
  * psscr value and mask of the deepest stop idle state.
  * Used when a cpu is offlined.
  */
-u64 pnv_deepest_stop_psscr_val;
-u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_psscr_val;
+static u64 pnv_deepest_stop_psscr_mask;
+static bool deepest_stop_found;
+
+/*
+ * pnv_cpu_offline: A function that puts the CPU into the deepest
+ * available platform idle state on a CPU-Offline.
+ */
+unsigned long pnv_cpu_offline(unsigned int cpu)
+{
+	unsigned long srr1;
+
+	u32 idle_states = pnv_get_supported_cpuidle_states();
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
+		srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
+					pnv_deepest_stop_psscr_mask);
+	} else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+		srr1 = power7_winkle();
+	} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
+		   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+		srr1 = power7_sleep();
+	} else if (idle_states & OPAL_PM_NAP_ENABLED) {
+		srr1 = power7_nap(1);
+	} else {
+		/* This is the fallback method. We emulate snooze */
+		while (!generic_check_cpu_restart(cpu)) {
+			HMT_low();
+			HMT_very_low();
+		}
+		srr1 = 0;
+		HMT_medium();
+	}
+
+	return srr1;
+}
 
 /*
  * Power ISA 3.0 idle initialization.
@@ -352,7 +381,6 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
 	u32 *residency_ns = NULL;
 	u64 max_residency_ns = 0;
 	int rc = 0, i;
-	bool default_stop_found = false, deepest_stop_found = false;
 
 	psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
 	psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
@@ -432,21 +460,24 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
 		}
 	}
 
-	if (!default_stop_found) {
-		pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL;
-		pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK;
-		pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+	if (unlikely(!default_stop_found)) {
+		pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
+	} else {
+		ppc_md.power_save = power9_idle;
+		pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
 			pnv_default_stop_val, pnv_default_stop_mask);
 	}
 
-	if (!deepest_stop_found) {
-		pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL;
-		pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK;
-		pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+	if (unlikely(!deepest_stop_found)) {
+		pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait");
+	} else {
+		pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n",
 			pnv_deepest_stop_psscr_val,
 			pnv_deepest_stop_psscr_mask);
 	}
 
+	pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
+		pnv_first_deep_stop_state);
 out:
 	kfree(psscr_val);
 	kfree(psscr_mask);
@@ -524,10 +555,30 @@ static int __init pnv_init_idle_states(void)
 
 	pnv_alloc_idle_core_states();
 
+	/*
+	 * For each CPU, record its PACA address in each of it's
+	 * sibling thread's PACA at the slot corresponding to this
+	 * CPU's index in the core.
+	 */
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+		int cpu;
+
+		pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n");
+		for_each_possible_cpu(cpu) {
+			int base_cpu = cpu_first_thread_sibling(cpu);
+			int idx = cpu_thread_in_core(cpu);
+			int i;
+
+			for (i = 0; i < threads_per_core; i++) {
+				int j = base_cpu + i;
+
+				paca[j].thread_sibling_pacas[idx] = &paca[cpu];
+			}
+		}
+	}
+
 	if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
 		ppc_md.power_save = power7_idle;
-	else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST)
-		ppc_md.power_save = power9_idle;
 
 out:
 	return 0;
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 1c383f38031d..067defeea691 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -9,11 +9,20 @@
  * License as published by the Free Software Foundation.
  */
 
+#include <linux/slab.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
+#include <linux/of.h>
 #include <linux/export.h>
 #include <linux/pci.h>
 #include <linux/memblock.h>
 #include <linux/iommu.h>
 
+#include <asm/tlb.h>
+#include <asm/powernv.h>
+#include <asm/reg.h>
+#include <asm/opal.h>
+#include <asm/io.h>
 #include <asm/iommu.h>
 #include <asm/pnv-pci.h>
 #include <asm/msi_bitmap.h>
@@ -22,6 +31,8 @@
 #include "powernv.h"
 #include "pci.h"
 
+#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
+
 /*
  * Other types of TCE cache invalidation are not functional in the
  * hardware.
@@ -37,6 +48,12 @@ struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
 	struct device_node *dn;
 	struct pci_dev *gpdev;
 
+	if (WARN_ON(!npdev))
+		return NULL;
+
+	if (WARN_ON(!npdev->dev.of_node))
+		return NULL;
+
 	/* Get assoicated PCI device */
 	dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
 	if (!dn)
@@ -55,6 +72,12 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
 	struct device_node *dn;
 	struct pci_dev *npdev;
 
+	if (WARN_ON(!gpdev))
+		return NULL;
+
+	if (WARN_ON(!gpdev->dev.of_node))
+		return NULL;
+
 	/* Get assoicated PCI device */
 	dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
 	if (!dn)
@@ -180,7 +203,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
 		pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
 		return rc;
 	}
-	pnv_pci_phb3_tce_invalidate_entire(phb, false);
+	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
 
 	/* Add the table to the list so its TCE cache will get invalidated */
 	pnv_pci_link_table_and_group(phb->hose->node, num,
@@ -204,7 +227,7 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
 		pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
 		return rc;
 	}
-	pnv_pci_phb3_tce_invalidate_entire(phb, false);
+	pnv_pci_ioda2_tce_invalidate_entire(phb, false);
 
 	pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
 			&npe->table_group);
@@ -270,7 +293,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
 			0 /* bypass base */, top);
 
 	if (rc == OPAL_SUCCESS)
-		pnv_pci_phb3_tce_invalidate_entire(phb, false);
+		pnv_pci_ioda2_tce_invalidate_entire(phb, false);
 
 	return rc;
 }
@@ -334,7 +357,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
 		pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
 		return;
 	}
-	pnv_pci_phb3_tce_invalidate_entire(npe->phb, false);
+	pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
 }
 
 struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
@@ -359,3 +382,442 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
 
 	return gpe;
 }
+
+/* Maximum number of nvlinks per npu */
+#define NV_MAX_LINKS 6
+
+/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
+static int max_npu2_index;
+
+struct npu_context {
+	struct mm_struct *mm;
+	struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
+	struct mmu_notifier mn;
+	struct kref kref;
+
+	/* Callback to stop translation requests on a given GPU */
+	struct npu_context *(*release_cb)(struct npu_context *, void *);
+
+	/*
+	 * Private pointer passed to the above callback for usage by
+	 * device drivers.
+	 */
+	void *priv;
+};
+
+/*
+ * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
+ * if none are available.
+ */
+static int get_mmio_atsd_reg(struct npu *npu)
+{
+	int i;
+
+	for (i = 0; i < npu->mmio_atsd_count; i++) {
+		if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
+			return i;
+	}
+
+	return -ENOSPC;
+}
+
+static void put_mmio_atsd_reg(struct npu *npu, int reg)
+{
+	clear_bit(reg, &npu->mmio_atsd_usage);
+}
+
+/* MMIO ATSD register offsets */
+#define XTS_ATSD_AVA  1
+#define XTS_ATSD_STAT 2
+
+static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
+				unsigned long va)
+{
+	int mmio_atsd_reg;
+
+	do {
+		mmio_atsd_reg = get_mmio_atsd_reg(npu);
+		cpu_relax();
+	} while (mmio_atsd_reg < 0);
+
+	__raw_writeq(cpu_to_be64(va),
+		npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
+	eieio();
+	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
+
+	return mmio_atsd_reg;
+}
+
+static int mmio_invalidate_pid(struct npu *npu, unsigned long pid)
+{
+	unsigned long launch;
+
+	/* IS set to invalidate matching PID */
+	launch = PPC_BIT(12);
+
+	/* PRS set to process-scoped */
+	launch |= PPC_BIT(13);
+
+	/* AP */
+	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+	/* PID */
+	launch |= pid << PPC_BITLSHIFT(38);
+
+	/* Invalidating the entire process doesn't use a va */
+	return mmio_launch_invalidate(npu, launch, 0);
+}
+
+static int mmio_invalidate_va(struct npu *npu, unsigned long va,
+			unsigned long pid)
+{
+	unsigned long launch;
+
+	/* IS set to invalidate target VA */
+	launch = 0;
+
+	/* PRS set to process scoped */
+	launch |= PPC_BIT(13);
+
+	/* AP */
+	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+	/* PID */
+	launch |= pid << PPC_BITLSHIFT(38);
+
+	return mmio_launch_invalidate(npu, launch, va);
+}
+
+#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
+
+/*
+ * Invalidate either a single address or an entire PID depending on
+ * the value of va.
+ */
+static void mmio_invalidate(struct npu_context *npu_context, int va,
+			unsigned long address)
+{
+	int i, j, reg;
+	struct npu *npu;
+	struct pnv_phb *nphb;
+	struct pci_dev *npdev;
+	struct {
+		struct npu *npu;
+		int reg;
+	} mmio_atsd_reg[NV_MAX_NPUS];
+	unsigned long pid = npu_context->mm->context.id;
+
+	/*
+	 * Loop over all the NPUs this process is active on and launch
+	 * an invalidate.
+	 */
+	for (i = 0; i <= max_npu2_index; i++) {
+		mmio_atsd_reg[i].reg = -1;
+		for (j = 0; j < NV_MAX_LINKS; j++) {
+			npdev = npu_context->npdev[i][j];
+			if (!npdev)
+				continue;
+
+			nphb = pci_bus_to_host(npdev->bus)->private_data;
+			npu = &nphb->npu;
+			mmio_atsd_reg[i].npu = npu;
+
+			if (va)
+				mmio_atsd_reg[i].reg =
+					mmio_invalidate_va(npu, address, pid);
+			else
+				mmio_atsd_reg[i].reg =
+					mmio_invalidate_pid(npu, pid);
+
+			/*
+			 * The NPU hardware forwards the shootdown to all GPUs
+			 * so we only have to launch one shootdown per NPU.
+			 */
+			break;
+		}
+	}
+
+	/*
+	 * Unfortunately the nest mmu does not support flushing specific
+	 * addresses so we have to flush the whole mm.
+	 */
+	flush_tlb_mm(npu_context->mm);
+
+	/* Wait for all invalidations to complete */
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		/* Wait for completion */
+		npu = mmio_atsd_reg[i].npu;
+		reg = mmio_atsd_reg[i].reg;
+		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
+			cpu_relax();
+		put_mmio_atsd_reg(npu, reg);
+	}
+}
+
+static void pnv_npu2_mn_release(struct mmu_notifier *mn,
+				struct mm_struct *mm)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+
+	/* Call into device driver to stop requests to the NMMU */
+	if (npu_context->release_cb)
+		npu_context->release_cb(npu_context, npu_context->priv);
+
+	/*
+	 * There should be no more translation requests for this PID, but we
+	 * need to ensure any entries for it are removed from the TLB.
+	 */
+	mmio_invalidate(npu_context, 0, 0);
+}
+
+static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address,
+				pte_t pte)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+
+	mmio_invalidate(npu_context, 1, address);
+}
+
+static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long address)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+
+	mmio_invalidate(npu_context, 1, address);
+}
+
+static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long start, unsigned long end)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+	unsigned long address;
+
+	for (address = start; address <= end; address += PAGE_SIZE)
+		mmio_invalidate(npu_context, 1, address);
+}
+
+static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
+	.release = pnv_npu2_mn_release,
+	.change_pte = pnv_npu2_mn_change_pte,
+	.invalidate_page = pnv_npu2_mn_invalidate_page,
+	.invalidate_range = pnv_npu2_mn_invalidate_range,
+};
+
+/*
+ * Call into OPAL to setup the nmmu context for the current task in
+ * the NPU. This must be called to setup the context tables before the
+ * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
+ *
+ * A release callback should be registered to allow a device driver to
+ * be notified that it should not launch any new translation requests
+ * as the final TLB invalidate is about to occur.
+ *
+ * Returns an error if there no contexts are currently available or a
+ * npu_context which should be passed to pnv_npu2_handle_fault().
+ *
+ * mmap_sem must be held in write mode.
+ */
+struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+			unsigned long flags,
+			struct npu_context *(*cb)(struct npu_context *, void *),
+			void *priv)
+{
+	int rc;
+	u32 nvlink_index;
+	struct device_node *nvlink_dn;
+	struct mm_struct *mm = current->mm;
+	struct pnv_phb *nphb;
+	struct npu *npu;
+	struct npu_context *npu_context;
+
+	/*
+	 * At present we don't support GPUs connected to multiple NPUs and I'm
+	 * not sure the hardware does either.
+	 */
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return ERR_PTR(-ENODEV);
+
+	if (!npdev)
+		/* No nvlink associated with this GPU device */
+		return ERR_PTR(-ENODEV);
+
+	if (!mm) {
+		/* kernel thread contexts are not supported */
+		return ERR_PTR(-EINVAL);
+	}
+
+	nphb = pci_bus_to_host(npdev->bus)->private_data;
+	npu = &nphb->npu;
+
+	/*
+	 * Setup the NPU context table for a particular GPU. These need to be
+	 * per-GPU as we need the tables to filter ATSDs when there are no
+	 * active contexts on a particular GPU.
+	 */
+	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
+				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+	if (rc < 0)
+		return ERR_PTR(-ENOSPC);
+
+	/*
+	 * We store the npu pci device so we can more easily get at the
+	 * associated npus.
+	 */
+	npu_context = mm->context.npu_context;
+	if (!npu_context) {
+		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
+		if (!npu_context)
+			return ERR_PTR(-ENOMEM);
+
+		mm->context.npu_context = npu_context;
+		npu_context->mm = mm;
+		npu_context->mn.ops = &nv_nmmu_notifier_ops;
+		__mmu_notifier_register(&npu_context->mn, mm);
+		kref_init(&npu_context->kref);
+	} else {
+		kref_get(&npu_context->kref);
+	}
+
+	npu_context->release_cb = cb;
+	npu_context->priv = priv;
+	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+							&nvlink_index)))
+		return ERR_PTR(-ENODEV);
+	npu_context->npdev[npu->index][nvlink_index] = npdev;
+
+	return npu_context;
+}
+EXPORT_SYMBOL(pnv_npu2_init_context);
+
+static void pnv_npu2_release_context(struct kref *kref)
+{
+	struct npu_context *npu_context =
+		container_of(kref, struct npu_context, kref);
+
+	npu_context->mm->context.npu_context = NULL;
+	mmu_notifier_unregister(&npu_context->mn,
+				npu_context->mm);
+
+	kfree(npu_context);
+}
+
+void pnv_npu2_destroy_context(struct npu_context *npu_context,
+			struct pci_dev *gpdev)
+{
+	struct pnv_phb *nphb, *phb;
+	struct npu *npu;
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+	struct device_node *nvlink_dn;
+	u32 nvlink_index;
+
+	if (WARN_ON(!npdev))
+		return;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return;
+
+	nphb = pci_bus_to_host(npdev->bus)->private_data;
+	npu = &nphb->npu;
+	phb = pci_bus_to_host(gpdev->bus)->private_data;
+	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+							&nvlink_index)))
+		return;
+	npu_context->npdev[npu->index][nvlink_index] = NULL;
+	opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id,
+				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+	kref_put(&npu_context->kref, pnv_npu2_release_context);
+}
+EXPORT_SYMBOL(pnv_npu2_destroy_context);
+
+/*
+ * Assumes mmap_sem is held for the contexts associated mm.
+ */
+int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
+			unsigned long *flags, unsigned long *status, int count)
+{
+	u64 rc = 0, result = 0;
+	int i, is_write;
+	struct page *page[1];
+
+	/* mmap_sem should be held so the struct_mm must be present */
+	struct mm_struct *mm = context->mm;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return -ENODEV;
+
+	WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+	for (i = 0; i < count; i++) {
+		is_write = flags[i] & NPU2_WRITE;
+		rc = get_user_pages_remote(NULL, mm, ea[i], 1,
+					is_write ? FOLL_WRITE : 0,
+					page, NULL, NULL);
+
+		/*
+		 * To support virtualised environments we will have to do an
+		 * access to the page to ensure it gets faulted into the
+		 * hypervisor. For the moment virtualisation is not supported in
+		 * other areas so leave the access out.
+		 */
+		if (rc != 1) {
+			status[i] = rc;
+			result = -EFAULT;
+			continue;
+		}
+
+		status[i] = 0;
+		put_page(page[0]);
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(pnv_npu2_handle_fault);
+
+int pnv_npu2_init(struct pnv_phb *phb)
+{
+	unsigned int i;
+	u64 mmio_atsd;
+	struct device_node *dn;
+	struct pci_dev *gpdev;
+	static int npu_index;
+	uint64_t rc = 0;
+
+	for_each_child_of_node(phb->hose->dn, dn) {
+		gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
+		if (gpdev) {
+			rc = opal_npu_map_lpar(phb->opal_id,
+				PCI_DEVID(gpdev->bus->number, gpdev->devfn),
+				0, 0);
+			if (rc)
+				dev_err(&gpdev->dev,
+					"Error %lld mapping device to LPAR\n",
+					rc);
+		}
+	}
+
+	for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
+							i, &mmio_atsd); i++)
+		phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
+
+	pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
+	phb->npu.mmio_atsd_count = i;
+	phb->npu.mmio_atsd_usage = 0;
+	npu_index++;
+	if (WARN_ON(npu_index >= NV_MAX_NPUS))
+		return -ENOSPC;
+	max_npu2_index = npu_index;
+	phb->npu.index = npu_index;
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index a91d7876fae2..6c7ad1d8b32e 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/bug.h>
-#include <linux/debugfs.h>
 #include <linux/io.h>
 #include <linux/slab.h>
 
@@ -21,7 +20,7 @@
 #include <asm/opal.h>
 #include <asm/prom.h>
 #include <linux/uaccess.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/isa-bridge.h>
 
 static int opal_lpc_chip_id = -1;
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index 308efd170c27..aa267f120033 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -64,6 +64,10 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
 		*sensor_data = be32_to_cpu(data);
 		break;
 
+	case OPAL_WRONG_STATE:
+		ret = -EIO;
+		break;
+
 	default:
 		ret = opal_error_code(ret);
 		break;
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index da8a0f7a035c..f620572f891f 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1);						\
 #define OPAL_BRANCH(LABEL)
 #endif
 
-/* TODO:
- *
- * - Trace irqs in/off (needs saving/restoring all args, argh...)
- * - Get r11 feed up by Dave so I can have better register usage
+/*
+ * DO_OPAL_CALL assumes:
+ * r0  = opal call token
+ * r12 = msr
+ * LR has been saved
  */
-
-#define OPAL_CALL(name, token)		\
- _GLOBAL_TOC(name);			\
-	mfmsr	r12;			\
-	mflr	r0;			\
-	andi.	r11,r12,MSR_IR|MSR_DR; 	\
-	std	r0,PPC_LR_STKOFF(r1);	\
-	li	r0,token;		\
-	beq	opal_real_call;         \
-	OPAL_BRANCH(opal_tracepoint_entry) \
+#define DO_OPAL_CALL()			\
 	mfcr	r11;			\
 	stw	r11,8(r1);		\
 	li	r11,0;			\
@@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1);						\
 	mtspr	SPRN_HSRR0,r12;		\
 	hrfid
 
+#define OPAL_CALL(name, token)		\
+ _GLOBAL_TOC(name);			\
+	mfmsr	r12;			\
+	mflr	r0;			\
+	andi.	r11,r12,MSR_IR|MSR_DR; 	\
+	std	r0,PPC_LR_STKOFF(r1);	\
+	li	r0,token;		\
+	beq	opal_real_call;         \
+	OPAL_BRANCH(opal_tracepoint_entry) \
+	DO_OPAL_CALL()
+
+
 opal_return:
 	/*
 	 * Fixup endian on OPAL return... we should be able to simplify
@@ -148,26 +152,13 @@ opal_tracepoint_entry:
 	ld	r8,STK_REG(R29)(r1)
 	ld	r9,STK_REG(R30)(r1)
 	ld	r10,STK_REG(R31)(r1)
+
+	/* setup LR so we return via tracepoint_return */
 	LOAD_REG_ADDR(r11,opal_tracepoint_return)
-	mfcr	r12
 	std	r11,16(r1)
-	stw	r12,8(r1)
-	li	r11,0
+
 	mfmsr	r12
-	ori	r11,r11,MSR_EE
-	std	r12,PACASAVEDMSR(r13)
-	andc	r12,r12,r11
-	mtmsrd	r12,1
-	LOAD_REG_ADDR(r11,opal_return)
-	mtlr	r11
-	li	r11,MSR_DR|MSR_IR|MSR_LE
-	andc	r12,r12,r11
-	mtspr	SPRN_HSRR1,r12
-	LOAD_REG_ADDR(r11,opal)
-	ld	r12,8(r11)
-	ld	r2,0(r11)
-	mtspr	SPRN_HSRR0,r12
-	hrfid
+	DO_OPAL_CALL()
 
 opal_tracepoint_return:
 	std	r3,STK_REG(R31)(r1)
@@ -301,3 +292,21 @@ OPAL_CALL(opal_int_eoi,				OPAL_INT_EOI);
 OPAL_CALL(opal_int_set_mfrr,			OPAL_INT_SET_MFRR);
 OPAL_CALL(opal_pci_tce_kill,			OPAL_PCI_TCE_KILL);
 OPAL_CALL(opal_nmmu_set_ptcr,			OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_xive_reset,			OPAL_XIVE_RESET);
+OPAL_CALL(opal_xive_get_irq_info,		OPAL_XIVE_GET_IRQ_INFO);
+OPAL_CALL(opal_xive_get_irq_config,		OPAL_XIVE_GET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_set_irq_config,		OPAL_XIVE_SET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_get_queue_info,		OPAL_XIVE_GET_QUEUE_INFO);
+OPAL_CALL(opal_xive_set_queue_info,		OPAL_XIVE_SET_QUEUE_INFO);
+OPAL_CALL(opal_xive_donate_page,		OPAL_XIVE_DONATE_PAGE);
+OPAL_CALL(opal_xive_alloc_vp_block,		OPAL_XIVE_ALLOCATE_VP_BLOCK);
+OPAL_CALL(opal_xive_free_vp_block,		OPAL_XIVE_FREE_VP_BLOCK);
+OPAL_CALL(opal_xive_allocate_irq,		OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_free_irq,			OPAL_XIVE_FREE_IRQ);
+OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
+OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
+OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
+OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
+OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
+OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
+OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index d0ac535cf5d7..28651fb25417 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -73,25 +73,32 @@ static int opal_xscom_err_xlate(int64_t rc)
 
 static u64 opal_scom_unmangle(u64 addr)
 {
+	u64 tmp;
+
 	/*
-	 * XSCOM indirect addresses have the top bit set. Additionally
-	 * the rest of the top 3 nibbles is always 0.
+	 * XSCOM addresses use the top nibble to set indirect mode and
+	 * its form.  Bits 4-11 are always 0.
 	 *
 	 * Because the debugfs interface uses signed offsets and shifts
 	 * the address left by 3, we basically cannot use the top 4 bits
 	 * of the 64-bit address, and thus cannot use the indirect bit.
 	 *
-	 * To deal with that, we support the indirect bit being in bit
-	 * 4 (IBM notation) instead of bit 0 in this API, we do the
-	 * conversion here. To leave room for further xscom address
-	 * expansion, we only clear out the top byte
+	 * To deal with that, we support the indirect bits being in
+	 * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we
+	 * do the conversion here.
 	 *
-	 * For in-kernel use, we also support the real indirect bit, so
-	 * we test for any of the top 5 bits
+	 * For in-kernel use, we don't need to do this mangling.  In
+	 * kernel won't have bits 4-7 set.
 	 *
+	 * So:
+	 *   debugfs will always   set 0-3 = 0 and clear 4-7
+	 *    kernel will always clear 0-3 = 0 and   set 4-7
 	 */
-	if (addr & (0x1full << 59))
-		addr = (addr & ~(0xffull << 56)) | (1ull << 63);
+	tmp = addr;
+	tmp  &= 0x0f00000000000000;
+	addr &= 0xf0ffffffffffffff;
+	addr |= tmp << 4;
+
 	return addr;
 }
 
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index e0f856bfbfe8..7925a9d72cca 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -435,7 +435,7 @@ int opal_machine_check(struct pt_regs *regs)
 		       evt.version);
 		return 0;
 	}
-	machine_check_print_event_info(&evt);
+	machine_check_print_event_info(&evt, user_mode(regs));
 
 	if (opal_recover_mce(regs, &evt))
 		return 1;
@@ -595,6 +595,80 @@ static void opal_export_symmap(void)
 		pr_warn("Error %d creating OPAL symbols file\n", rc);
 }
 
+static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
+				struct bin_attribute *bin_attr, char *buf,
+				loff_t off, size_t count)
+{
+	return memory_read_from_buffer(buf, count, &off, bin_attr->private,
+				       bin_attr->size);
+}
+
+/*
+ * opal_export_attrs: creates a sysfs node for each property listed in
+ * the device-tree under /ibm,opal/firmware/exports/
+ * All new sysfs nodes are created under /opal/exports/.
+ * This allows for reserved memory regions (e.g. HDAT) to be read.
+ * The new sysfs nodes are only readable by root.
+ */
+static void opal_export_attrs(void)
+{
+	struct bin_attribute *attr;
+	struct device_node *np;
+	struct property *prop;
+	struct kobject *kobj;
+	u64 vals[2];
+	int rc;
+
+	np = of_find_node_by_path("/ibm,opal/firmware/exports");
+	if (!np)
+		return;
+
+	/* Create new 'exports' directory - /sys/firmware/opal/exports */
+	kobj = kobject_create_and_add("exports", opal_kobj);
+	if (!kobj) {
+		pr_warn("kobject_create_and_add() of exports failed\n");
+		return;
+	}
+
+	for_each_property_of_node(np, prop) {
+		if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle"))
+			continue;
+
+		if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
+			continue;
+
+		attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+
+		if (attr == NULL) {
+			pr_warn("Failed kmalloc for bin_attribute!");
+			continue;
+		}
+
+		sysfs_bin_attr_init(attr);
+		attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
+		attr->attr.mode = 0400;
+		attr->read = export_attr_read;
+		attr->private = __va(vals[0]);
+		attr->size = vals[1];
+
+		if (attr->attr.name == NULL) {
+			pr_warn("Failed kstrdup for bin_attribute attr.name");
+			kfree(attr);
+			continue;
+		}
+
+		rc = sysfs_create_bin_file(kobj, attr);
+		if (rc) {
+			pr_warn("Error %d creating OPAL sysfs exports/%s file\n",
+				 rc, prop->name);
+			kfree(attr->attr.name);
+			kfree(attr);
+		}
+	}
+
+	of_node_put(np);
+}
+
 static void __init opal_dump_region_init(void)
 {
 	void *addr;
@@ -733,6 +807,9 @@ static int __init opal_init(void)
 		opal_msglog_sysfs_init();
 	}
 
+	/* Export all properties */
+	opal_export_attrs();
+
 	/* Initialize platform devices: IPMI backend, PRD & flash interface */
 	opal_pdev_init("ibm,opal-ipmi");
 	opal_pdev_init("ibm,opal-flash");
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index e36738291c32..6fdbd383f676 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -14,7 +14,6 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/crash_dump.h>
-#include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -38,7 +37,7 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 #include <asm/xics.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/firmware.h>
 #include <asm/pnv-pci.h>
 #include <asm/mmzone.h>
@@ -1262,6 +1261,8 @@ static void pnv_pci_ioda_setup_PEs(void)
 			/* PE#0 is needed for error reporting */
 			pnv_ioda_reserve_pe(phb, 0);
 			pnv_ioda_setup_npu_PEs(hose->bus);
+			if (phb->model == PNV_PHB_MODEL_NPU2)
+				pnv_npu2_init(phb);
 		}
 	}
 }
@@ -1424,8 +1425,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 		iommu_group_put(pe->table_group.group);
 		BUG_ON(pe->table_group.group);
 	}
-	pnv_pci_ioda2_table_free_pages(tbl);
-	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
@@ -1860,6 +1860,17 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
 
 	return ret;
 }
+
+static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret)
+		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
+
+	return ret;
+}
 #endif
 
 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
@@ -1874,6 +1885,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 	.set = pnv_ioda1_tce_build,
 #ifdef CONFIG_IOMMU_API
 	.exchange = pnv_ioda1_tce_xchg,
+	.exchange_rm = pnv_ioda1_tce_xchg_rm,
 #endif
 	.clear = pnv_ioda1_tce_free,
 	.get = pnv_tce_get,
@@ -1883,7 +1895,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 #define PHB3_TCE_KILL_INVAL_PE		PPC_BIT(1)
 #define PHB3_TCE_KILL_INVAL_ONE		PPC_BIT(2)
 
-void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
 {
 	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
 	const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
@@ -1948,7 +1960,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 {
 	struct iommu_table_group_link *tgl;
 
-	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+	list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
 		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 				struct pnv_ioda_pe, table_group);
 		struct pnv_phb *phb = pe->phb;
@@ -1979,6 +1991,14 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 	}
 }
 
+void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+	if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
+		pnv_pci_phb3_tce_invalidate_entire(phb, rm);
+	else
+		opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
+}
+
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 		long npages, unsigned long uaddr,
 		enum dma_data_direction direction,
@@ -2004,6 +2024,17 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
 
 	return ret;
 }
+
+static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret)
+		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
+
+	return ret;
+}
 #endif
 
 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
@@ -2017,13 +2048,13 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 static void pnv_ioda2_table_free(struct iommu_table *tbl)
 {
 	pnv_pci_ioda2_table_free_pages(tbl);
-	iommu_free_table(tbl, "pnv");
 }
 
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
 	.exchange = pnv_ioda2_tce_xchg,
+	.exchange_rm = pnv_ioda2_tce_xchg_rm,
 #endif
 	.clear = pnv_ioda2_tce_free,
 	.get = pnv_tce_get,
@@ -2128,6 +2159,9 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
 
 found:
 	tbl = pnv_pci_table_alloc(phb->hose->node);
+	if (WARN_ON(!tbl))
+		return;
+
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
 	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
@@ -2203,7 +2237,7 @@ found:
 		__free_pages(tce_mem, get_order(tce32_segsz * segs));
 	if (tbl) {
 		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
-		iommu_free_table(tbl, "pnv");
+		iommu_tce_table_put(tbl);
 	}
 }
 
@@ -2293,16 +2327,16 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
 	if (!tbl)
 		return -ENOMEM;
 
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+
 	ret = pnv_pci_ioda2_table_alloc_pages(nid,
 			bus_offset, page_shift, window_size,
 			levels, tbl);
 	if (ret) {
-		iommu_free_table(tbl, "pnv");
+		iommu_tce_table_put(tbl);
 		return ret;
 	}
 
-	tbl->it_ops = &pnv_ioda2_iommu_ops;
-
 	*ptbl = tbl;
 
 	return 0;
@@ -2343,7 +2377,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	if (rc) {
 		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
 				rc);
-		pnv_ioda2_table_free(tbl);
+		iommu_tce_table_put(tbl);
 		return rc;
 	}
 
@@ -2414,7 +2448,8 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 
 		tce_table_size /= direct_table_size;
 		tce_table_size <<= 3;
-		tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
+		tce_table_size = max_t(unsigned long,
+				tce_table_size, direct_table_size);
 	}
 
 	return bytes;
@@ -2431,7 +2466,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (pe->pbus)
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
-	pnv_ioda2_table_free(tbl);
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -2735,9 +2770,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (rc)
 		return;
 
-	if (pe->flags & PNV_IODA_PE_DEV)
-		iommu_add_device(&pe->pdev->dev);
-	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
 }
 
@@ -3406,7 +3439,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
 	}
 
 	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
-	iommu_free_table(tbl, "pnv");
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
@@ -3433,7 +3466,7 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
 	}
 
 	pnv_pci_ioda2_table_free_pages(tbl);
-	iommu_free_table(tbl, "pnv");
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index eb835e977e33..935ccb249a8a 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -758,7 +758,7 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 
 unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 {
-	return *(pnv_tce(tbl, index - tbl->it_offset));
+	return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset)));
 }
 
 struct iommu_table *pnv_pci_table_alloc(int nid)
@@ -766,7 +766,11 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
 	struct iommu_table *tbl;
 
 	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
+	if (!tbl)
+		return NULL;
+
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	kref_init(&tbl->it_kref);
 
 	return tbl;
 }
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index e1d3e5526b54..18c8a2fa03b8 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -7,6 +7,9 @@
 
 struct pci_dn;
 
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+
 enum pnv_phb_type {
 	PNV_PHB_IODA1	= 0,
 	PNV_PHB_IODA2	= 1,
@@ -174,6 +177,16 @@ struct pnv_phb {
 		struct OpalIoP7IOCErrorData 	hub_diag;
 	} diag;
 
+	/* Nvlink2 data */
+	struct npu {
+		int index;
+		__be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
+		unsigned int mmio_atsd_count;
+
+		/* Bitmask for MMIO register usage */
+		unsigned long mmio_atsd_usage;
+	} npu;
+
 #ifdef CONFIG_CXL_BASE
 	struct cxl_afu *cxl_afu;
 #endif
@@ -229,14 +242,14 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 
 /* Nvlink functions */
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
-extern void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
 extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
 		struct iommu_table *tbl);
 extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
 extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
 extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
-
+extern int pnv_npu2_init(struct pnv_phb *phb);
 
 /* cxl functions */
 extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 613052232475..6dbc0a1da1f6 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -18,8 +18,6 @@ static inline void pnv_pci_shutdown(void) { }
 #endif
 
 extern u32 pnv_get_supported_cpuidle_states(void);
-extern u64 pnv_deepest_stop_psscr_val;
-extern u64 pnv_deepest_stop_psscr_mask;
 
 extern void pnv_lpc_init(void);
 
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 5dcbdea1afac..1a9d84371a4d 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -62,7 +62,7 @@ int powernv_get_random_real_mode(unsigned long *v)
 
 	rng = raw_cpu_read(powernv_rng);
 
-	*v = rng_whiten(rng, in_rm64(rng->regs_real));
+	*v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
 
 	return 1;
 }
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index d50c7d99baaf..2dc7e5fb86c3 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -32,6 +32,7 @@
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/xics.h>
+#include <asm/xive.h>
 #include <asm/opal.h>
 #include <asm/kexec.h>
 #include <asm/smp.h>
@@ -76,7 +77,9 @@ static void __init pnv_init(void)
 
 static void __init pnv_init_IRQ(void)
 {
-	xics_init();
+	/* Try using a XIVE if available, otherwise use a XICS */
+	if (!xive_native_init())
+		xics_init();
 
 	WARN_ON(!ppc_md.get_irq);
 }
@@ -95,6 +98,10 @@ static void pnv_show_cpuinfo(struct seq_file *m)
 	else
 		seq_printf(m, "firmware\t: BML\n");
 	of_node_put(root);
+	if (radix_enabled())
+		seq_printf(m, "MMU\t\t: Radix\n");
+	else
+		seq_printf(m, "MMU\t\t: Hash\n");
 }
 
 static void pnv_prepare_going_down(void)
@@ -218,10 +225,12 @@ static void pnv_kexec_wait_secondaries_down(void)
 
 static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 {
-	xics_kexec_teardown_cpu(secondary);
+	if (xive_enabled())
+		xive_kexec_teardown_cpu(secondary);
+	else
+		xics_kexec_teardown_cpu(secondary);
 
 	/* On OPAL, we return all CPUs to firmware */
-
 	if (!firmware_has_feature(FW_FEATURE_OPAL))
 		return;
 
@@ -237,6 +246,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 		/* Primary waits for the secondaries to have reached OPAL */
 		pnv_kexec_wait_secondaries_down();
 
+		/* Switch XIVE back to emulation mode */
+		if (xive_enabled())
+			xive_shutdown();
+
 		/*
 		 * We might be running as little-endian - now that interrupts
 		 * are disabled, reset the HILE bit to big-endian so we don't
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 8b67e1eefb5c..4aff754b6f2c 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -29,12 +29,14 @@
 #include <asm/vdso_datapage.h>
 #include <asm/cputhreads.h>
 #include <asm/xics.h>
+#include <asm/xive.h>
 #include <asm/opal.h>
 #include <asm/runlatch.h>
 #include <asm/code-patching.h>
 #include <asm/dbell.h>
 #include <asm/kvm_ppc.h>
 #include <asm/ppc-opcode.h>
+#include <asm/cpuidle.h>
 
 #include "powernv.h"
 
@@ -47,13 +49,10 @@
 
 static void pnv_smp_setup_cpu(int cpu)
 {
-	if (cpu != boot_cpuid)
+	if (xive_enabled())
+		xive_smp_setup_cpu();
+	else if (cpu != boot_cpuid)
 		xics_setup_cpu();
-
-#ifdef CONFIG_PPC_DOORBELL
-	if (cpu_has_feature(CPU_FTR_DBELL))
-		doorbell_setup_this_cpu();
-#endif
 }
 
 static int pnv_smp_kick_cpu(int nr)
@@ -132,7 +131,10 @@ static int pnv_smp_cpu_disable(void)
 	vdso_data->processorCount--;
 	if (cpu == boot_cpuid)
 		boot_cpuid = cpumask_any(cpu_online_mask);
-	xics_migrate_irqs_away();
+	if (xive_enabled())
+		xive_smp_disable_cpu();
+	else
+		xics_migrate_irqs_away();
 	return 0;
 }
 
@@ -140,7 +142,6 @@ static void pnv_smp_cpu_kill_self(void)
 {
 	unsigned int cpu;
 	unsigned long srr1, wmask;
-	u32 idle_states;
 
 	/* Standard hot unplug procedure */
 	local_irq_disable();
@@ -155,8 +156,6 @@ static void pnv_smp_cpu_kill_self(void)
 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
 		wmask = SRR1_WAKEMASK_P8;
 
-	idle_states = pnv_get_supported_cpuidle_states();
-
 	/* We don't want to take decrementer interrupts while we are offline,
 	 * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9)
 	 * enabled as to let IPIs in.
@@ -184,19 +183,7 @@ static void pnv_smp_cpu_kill_self(void)
 		kvmppc_set_host_ipi(cpu, 0);
 
 		ppc64_runlatch_off();
-
-		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-			srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
-						pnv_deepest_stop_psscr_mask);
-		} else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
-			srr1 = power7_winkle();
-		} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
-			   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
-			srr1 = power7_sleep();
-		} else {
-			srr1 = power7_nap(1);
-		}
-
+		srr1 = pnv_cpu_offline(cpu);
 		ppc64_runlatch_on();
 
 		/*
@@ -213,9 +200,12 @@ static void pnv_smp_cpu_kill_self(void)
 		if (((srr1 & wmask) == SRR1_WAKEEE) ||
 		    ((srr1 & wmask) == SRR1_WAKEHVI) ||
 		    (local_paca->irq_happened & PACA_IRQ_EE)) {
-			if (cpu_has_feature(CPU_FTR_ARCH_300))
-				icp_opal_flush_interrupt();
-			else
+			if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+				if (xive_enabled())
+					xive_flush_interrupt();
+				else
+					icp_opal_flush_interrupt();
+			} else
 				icp_native_flush_interrupt();
 		} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
 			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
@@ -252,10 +242,69 @@ static int pnv_cpu_bootable(unsigned int nr)
 	return smp_generic_cpu_bootable(nr);
 }
 
+static int pnv_smp_prepare_cpu(int cpu)
+{
+	if (xive_enabled())
+		return xive_smp_prepare_cpu(cpu);
+	return 0;
+}
+
+/* Cause IPI as setup by the interrupt controller (xics or xive) */
+static void (*ic_cause_ipi)(int cpu);
+
+static void pnv_cause_ipi(int cpu)
+{
+	if (doorbell_try_core_ipi(cpu))
+		return;
+
+	ic_cause_ipi(cpu);
+}
+
+static void pnv_p9_dd1_cause_ipi(int cpu)
+{
+	int this_cpu = get_cpu();
+
+	/*
+	 * POWER9 DD1 has a global addressed msgsnd, but for now we restrict
+	 * IPIs to same core, because it requires additional synchronization
+	 * for inter-core doorbells which we do not implement.
+	 */
+	if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu)))
+		doorbell_global_ipi(cpu);
+	else
+		ic_cause_ipi(cpu);
+
+	put_cpu();
+}
+
+static void __init pnv_smp_probe(void)
+{
+	if (xive_enabled())
+		xive_smp_probe();
+	else
+		xics_smp_probe();
+
+	if (cpu_has_feature(CPU_FTR_DBELL)) {
+		ic_cause_ipi = smp_ops->cause_ipi;
+		WARN_ON(!ic_cause_ipi);
+
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+				smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi;
+			else
+				smp_ops->cause_ipi = doorbell_global_ipi;
+		} else {
+			smp_ops->cause_ipi = pnv_cause_ipi;
+		}
+	}
+}
+
 static struct smp_ops_t pnv_smp_ops = {
-	.message_pass	= smp_muxed_ipi_message_pass,
-	.cause_ipi	= NULL,	/* Filled at runtime by xics_smp_probe() */
-	.probe		= xics_smp_probe,
+	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
+	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
+	.cause_nmi_ipi	= NULL,
+	.probe		= pnv_smp_probe,
+	.prepare_cpu	= pnv_smp_prepare_cpu,
 	.kick_cpu	= pnv_smp_kick_cpu,
 	.setup_cpu	= pnv_smp_setup_cpu,
 	.cpu_bootable	= pnv_cpu_bootable,
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 60154d08debf..1d1ad5df106f 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -77,7 +77,7 @@ static void __init ps3_smp_probe(void)
 		BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION    != 0);
 		BUILD_BUG_ON(PPC_MSG_RESCHEDULE       != 1);
 		BUILD_BUG_ON(PPC_MSG_TICK_BROADCAST   != 2);
-		BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK   != 3);
+		BUILD_BUG_ON(PPC_MSG_NMI_IPI          != 3);
 
 		for (i = 0; i < MSG_COUNT; i++) {
 			result = ps3_event_receive_port_setup(cpu, &virqs[i]);
@@ -96,7 +96,7 @@ static void __init ps3_smp_probe(void)
 				ps3_register_ipi_irq(cpu, virqs[i]);
 		}
 
-		ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_DEBUGGER_BREAK]);
+		ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_NMI_IPI]);
 
 		DBG(" <- %s:%d: (%d)\n", __func__, __LINE__, cpu);
 	}
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 30ec04f1c67c..913c54e23eea 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -17,9 +17,10 @@ config PPC_PSERIES
 	select PPC_UDBG_16550
 	select PPC_NATIVE
 	select PPC_DOORBELL
-	select HOTPLUG_CPU if SMP
+	select HOTPLUG_CPU
 	select ARCH_RANDOM
 	select PPC_DOORBELL
+	select FORCE_SMP
 	default y
 
 config PPC_SPLPAR
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 193e052fa0dd..bda18d8e1674 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -288,7 +288,6 @@ int dlpar_detach_node(struct device_node *dn)
 	if (rc)
 		return rc;
 
-	of_node_put(dn); /* Must decrement the refcount */
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index 6b04e3f0f982..18014cdeb590 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -21,13 +21,12 @@
  */
 
 #include <linux/slab.h>
-#include <linux/debugfs.h>
 #include <linux/spinlock.h>
 #include <asm/smp.h>
 #include <linux/uaccess.h>
 #include <asm/firmware.h>
 #include <asm/lppaca.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/plpar_wrappers.h>
 #include <asm/machdep.h>
 
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
index f02ec3ab428c..957ae347b0b3 100644
--- a/arch/powerpc/platforms/pseries/hvCall_inst.c
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -29,6 +29,16 @@
 #include <asm/trace.h>
 #include <asm/machdep.h>
 
+/* For hcall instrumentation. One structure per-hcall, per-CPU */
+struct hcall_stats {
+	unsigned long	num_calls;	/* number of calls (on this CPU) */
+	unsigned long	tb_total;	/* total wall time (mftb) of calls. */
+	unsigned long	purr_total;	/* total cpu time (PURR) of calls. */
+	unsigned long	tb_start;
+	unsigned long	purr_start;
+};
+#define HCALL_STAT_ARRAY_SIZE	((MAX_HCALL_OPCODE >> 2) + 1)
+
 DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
 
 /*
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 4d757eaa46bf..8374adee27e3 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -74,6 +74,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 		goto fail_exit;
 
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	kref_init(&tbl->it_kref);
 	tgl->table_group = table_group;
 	list_add_rcu(&tgl->next, &tbl->it_group_list);
 
@@ -115,7 +116,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 		BUG_ON(table_group->group);
 	}
 #endif
-	iommu_free_table(tbl, node_name);
+	iommu_tce_table_put(tbl);
 
 	kfree(table_group);
 }
@@ -550,6 +551,7 @@ static void iommu_table_setparms(struct pci_controller *phb,
 static void iommu_table_setparms_lpar(struct pci_controller *phb,
 				      struct device_node *dn,
 				      struct iommu_table *tbl,
+				      struct iommu_table_group *table_group,
 				      const __be32 *dma_window)
 {
 	unsigned long offset, size;
@@ -563,6 +565,9 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
 	tbl->it_type = TCE_PCI;
 	tbl->it_offset = offset >> tbl->it_page_shift;
 	tbl->it_size = size >> tbl->it_page_shift;
+
+	table_group->tce32_start = offset;
+	table_group->tce32_size = size;
 }
 
 struct iommu_table_ops iommu_table_pseries_ops = {
@@ -651,8 +656,38 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
 }
 
+#ifdef CONFIG_IOMMU_API
+static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
+				long *tce, enum dma_data_direction *direction)
+{
+	long rc;
+	unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
+	unsigned long flags, oldtce = 0;
+	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+	unsigned long newtce = *tce | proto_tce;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+
+	rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
+	if (!rc)
+		rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);
+
+	if (!rc) {
+		*direction = iommu_tce_direction(oldtce);
+		*tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	}
+
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+
+	return rc;
+}
+#endif
+
 struct iommu_table_ops iommu_table_lpar_multi_ops = {
 	.set = tce_buildmulti_pSeriesLP,
+#ifdef CONFIG_IOMMU_API
+	.exchange = tce_exchange_pseries,
+#endif
 	.clear = tce_freemulti_pSeriesLP,
 	.get = tce_get_pSeriesLP
 };
@@ -689,7 +724,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	if (!ppci->table_group) {
 		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
 		tbl = ppci->table_group->tables[0];
-		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
+		iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
+				ppci->table_group, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		iommu_init_table(tbl, ppci->phb->node);
 		iommu_register_group(ppci->table_group,
@@ -1143,7 +1179,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	if (!pci->table_group) {
 		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
 		tbl = pci->table_group->tables[0];
-		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
+		iommu_table_setparms_lpar(pci->phb, pdn, tbl,
+				pci->table_group, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		iommu_init_table(tbl, pci->phb->node);
 		iommu_register_group(pci->table_group,
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 8b1fe895daa3..6541d0b03e4c 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -958,3 +958,64 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
 
 	return rc;
 }
+
+static unsigned long vsid_unscramble(unsigned long vsid, int ssize)
+{
+	unsigned long protovsid;
+	unsigned long va_bits = VA_BITS;
+	unsigned long modinv, vsid_modulus;
+	unsigned long max_mod_inv, tmp_modinv;
+
+	if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
+		va_bits = 65;
+
+	if (ssize == MMU_SEGSIZE_256M) {
+		modinv = VSID_MULINV_256M;
+		vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
+	} else {
+		modinv = VSID_MULINV_1T;
+		vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
+	}
+
+	/*
+	 * vsid outside our range.
+	 */
+	if (vsid >= vsid_modulus)
+		return 0;
+
+	/*
+	 * If modinv is the modular multiplicate inverse of (x % vsid_modulus)
+	 * and vsid = (protovsid * x) % vsid_modulus, then we say:
+	 *   protovsid = (vsid * modinv) % vsid_modulus
+	 */
+
+	/* Check if (vsid * modinv) overflow (63 bits) */
+	max_mod_inv = 0x7fffffffffffffffull / vsid;
+	if (modinv < max_mod_inv)
+		return (vsid * modinv) % vsid_modulus;
+
+	tmp_modinv = modinv/max_mod_inv;
+	modinv %= max_mod_inv;
+
+	protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
+	protovsid = (protovsid + vsid * modinv) % vsid_modulus;
+
+	return protovsid;
+}
+
+static int __init reserve_vrma_context_id(void)
+{
+	unsigned long protovsid;
+
+	/*
+	 * Reserve context ids which map to reserved virtual addresses. For now
+	 * we only reserve the context id which maps to the VRMA VSID. We ignore
+	 * the addresses in "ibm,adjunct-virtual-addresses" because we don't
+	 * enable adjunct support via the "ibm,client-architecture-support"
+	 * interface.
+	 */
+	protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
+	hash__reserve_context_id(protovsid >> ESID_BITS_1T);
+	return 0;
+}
+machine_device_initcall(pseries, reserve_vrma_context_id);
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 904a677208d1..bb70b26334f0 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -386,6 +386,10 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 		}
 		fwnmi_release_errinfo();
 	}
+
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+
 	return 0; /* need to perform reset */
 }
 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index b4d362ed03a1..b5d86426e97b 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -87,6 +87,10 @@ static void pSeries_show_cpuinfo(struct seq_file *m)
 		model = of_get_property(root, "model", NULL);
 	seq_printf(m, "machine\t\t: CHRP %s\n", model);
 	of_node_put(root);
+	if (radix_enabled())
+		seq_printf(m, "MMU\t\t: Radix\n");
+	else
+		seq_printf(m, "MMU\t\t: Hash\n");
 }
 
 /* Initialize firmware assisted non-maskable interrupts if
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index f6f83aeccaaa..52ca6b311d44 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -55,11 +55,6 @@
  */
 static cpumask_var_t of_spin_mask;
 
-/*
- * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here
- */
-static void  (*xics_cause_ipi)(int cpu, unsigned long data);
-
 /* Query where a cpu is now.  Return codes #defined in plpar_wrappers.h */
 int smp_query_cpu_stopped(unsigned int pcpu)
 {
@@ -143,8 +138,6 @@ static void smp_setup_cpu(int cpu)
 {
 	if (cpu != boot_cpuid)
 		xics_setup_cpu();
-	if (cpu_has_feature(CPU_FTR_DBELL))
-		doorbell_setup_this_cpu();
 
 	if (firmware_has_feature(FW_FEATURE_SPLPAR))
 		vpa_init(cpu);
@@ -187,28 +180,50 @@ static int smp_pSeries_kick_cpu(int nr)
 	return 0;
 }
 
-/* Only used on systems that support multiple IPI mechanisms */
-static void pSeries_cause_ipi_mux(int cpu, unsigned long data)
+static void smp_pseries_cause_ipi(int cpu)
 {
-	if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id())))
-		doorbell_cause_ipi(cpu, data);
-	else
-		xics_cause_ipi(cpu, data);
+	/* POWER9 should not use this handler */
+	if (doorbell_try_core_ipi(cpu))
+		return;
+
+	icp_ops->cause_ipi(cpu);
+}
+
+static int pseries_cause_nmi_ipi(int cpu)
+{
+	int hwcpu;
+
+	if (cpu == NMI_IPI_ALL_OTHERS) {
+		hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
+	} else {
+		if (cpu < 0) {
+			WARN_ONCE(true, "incorrect cpu parameter %d", cpu);
+			return 0;
+		}
+
+		hwcpu = get_hard_smp_processor_id(cpu);
+	}
+
+	if (plapr_signal_sys_reset(hwcpu) == H_SUCCESS)
+		return 1;
+
+	return 0;
 }
 
 static __init void pSeries_smp_probe(void)
 {
 	xics_smp_probe();
 
-	if (cpu_has_feature(CPU_FTR_DBELL)) {
-		xics_cause_ipi = smp_ops->cause_ipi;
-		smp_ops->cause_ipi = pSeries_cause_ipi_mux;
-	}
+	if (cpu_has_feature(CPU_FTR_DBELL))
+		smp_ops->cause_ipi = smp_pseries_cause_ipi;
+	else
+		smp_ops->cause_ipi = icp_ops->cause_ipi;
 }
 
 static struct smp_ops_t pseries_smp_ops = {
 	.message_pass	= NULL,	/* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pSeries_smp_probe() */
+	.cause_nmi_ipi	= pseries_cause_nmi_ipi,
 	.probe		= pSeries_smp_probe,
 	.kick_cpu	= smp_pSeries_kick_cpu,
 	.setup_cpu	= smp_setup_cpu,
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 720493932486..28b09fd797ec 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1318,7 +1318,7 @@ static void vio_dev_release(struct device *dev)
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 
 	if (tbl)
-		iommu_free_table(tbl, of_node_full_name(dev->of_node));
+		iommu_tce_table_put(tbl);
 	of_node_put(dev->of_node);
 	kfree(to_vio_dev(dev));
 }
diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig
index 52dc165c0efb..caf882e749dc 100644
--- a/arch/powerpc/sysdev/Kconfig
+++ b/arch/powerpc/sysdev/Kconfig
@@ -28,6 +28,7 @@ config PPC_MSI_BITMAP
 	default y if PPC_POWERNV
 
 source "arch/powerpc/sysdev/xics/Kconfig"
+source "arch/powerpc/sysdev/xive/Kconfig"
 
 config PPC_SCOM
 	bool
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index a254824719f1..c0ae11d4f62f 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS)	+= udbg_memcons.o
 subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 obj-$(CONFIG_PPC_XICS)		+= xics/
+obj-$(CONFIG_PPC_XIVE)		+= xive/
 
 obj-$(CONFIG_GE_FPGA)		+= ge/
diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c
index d0e9f178a324..76ea32c1b664 100644
--- a/arch/powerpc/sysdev/scom.c
+++ b/arch/powerpc/sysdev/scom.c
@@ -19,10 +19,9 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/prom.h>
 #include <asm/scom.h>
 #include <linux/uaccess.h>
diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c
index e7fa26c4ff73..bbc839a98c41 100644
--- a/arch/powerpc/sysdev/xics/icp-hv.c
+++ b/arch/powerpc/sysdev/xics/icp-hv.c
@@ -138,7 +138,7 @@ static void icp_hv_set_cpu_priority(unsigned char cppr)
 
 #ifdef CONFIG_SMP
 
-static void icp_hv_cause_ipi(int cpu, unsigned long data)
+static void icp_hv_cause_ipi(int cpu)
 {
 	icp_hv_set_qirr(cpu, IPI_PRIORITY);
 }
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 8a6a043e239b..2bfb9968d562 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -143,19 +143,9 @@ static unsigned int icp_native_get_irq(void)
 
 #ifdef CONFIG_SMP
 
-static void icp_native_cause_ipi(int cpu, unsigned long data)
+static void icp_native_cause_ipi(int cpu)
 {
 	kvmppc_set_host_ipi(cpu, 1);
-#ifdef CONFIG_PPC_DOORBELL
-	if (cpu_has_feature(CPU_FTR_DBELL)) {
-		if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) {
-			doorbell_cause_ipi(cpu, data);
-			put_cpu();
-			return;
-		}
-		put_cpu();
-	}
-#endif
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
@@ -168,15 +158,15 @@ void icp_native_cause_ipi_rm(int cpu)
 	 * Need the physical address of the XICS to be
 	 * previously saved in kvm_hstate in the paca.
 	 */
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 
 	/*
 	 * Just like the cause_ipi functions, it is required to
-	 * include a full barrier (out8 includes a sync) before
-	 * causing the IPI.
+	 * include a full barrier before causing the IPI.
 	 */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
-	out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY);
+	mb();
+	__raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
 }
 #endif
 
diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c
index b53f80f0b4d8..c71d2ea42627 100644
--- a/arch/powerpc/sysdev/xics/icp-opal.c
+++ b/arch/powerpc/sysdev/xics/icp-opal.c
@@ -126,7 +126,7 @@ static void icp_opal_eoi(struct irq_data *d)
 
 #ifdef CONFIG_SMP
 
-static void icp_opal_cause_ipi(int cpu, unsigned long data)
+static void icp_opal_cause_ipi(int cpu)
 {
 	int hw_cpu = get_hard_smp_processor_id(cpu);
 
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 23efe4e42172..ffe138b8b9dc 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -143,11 +143,11 @@ static void xics_request_ipi(void)
 
 void __init xics_smp_probe(void)
 {
-	/* Setup cause_ipi callback  based on which ICP is used */
-	smp_ops->cause_ipi = icp_ops->cause_ipi;
-
 	/* Register all the IPIs */
 	xics_request_ipi();
+
+	/* Setup cause_ipi callback based on which ICP is used */
+	smp_ops->cause_ipi = icp_ops->cause_ipi;
 }
 
 #endif /* CONFIG_SMP */
diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig
new file mode 100644
index 000000000000..12ccd7373d2f
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/Kconfig
@@ -0,0 +1,11 @@
+config PPC_XIVE
+	bool
+	default n
+	select PPC_SMP_MUXED_IPI
+	select HARDIRQS_SW_RESEND
+
+config PPC_XIVE_NATIVE
+	bool
+	default n
+	select PPC_XIVE
+	depends on PPC_POWERNV
diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile
new file mode 100644
index 000000000000..3fab303fc169
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/Makefile
@@ -0,0 +1,4 @@
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+obj-y				+= common.o
+obj-$(CONFIG_PPC_XIVE_NATIVE)	+= native.o
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
new file mode 100644
index 000000000000..6a98efb14264
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -0,0 +1,1302 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "xive: " fmt
+
+#include <linux/types.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/debugfs.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/msi.h>
+
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/errno.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/xmon.h>
+
+#include "xive-internal.h"
+
+#undef DEBUG_FLUSH
+#undef DEBUG_ALL
+
+#ifdef DEBUG_ALL
+#define DBG_VERBOSE(fmt...)	pr_devel(fmt)
+#else
+#define DBG_VERBOSE(fmt...)	do { } while(0)
+#endif
+
+bool __xive_enabled;
+bool xive_cmdline_disabled;
+
+/* We use only one priority for now */
+static u8 xive_irq_priority;
+
+/* TIMA */
+void __iomem *xive_tima;
+u32 xive_tima_offset;
+
+/* Backend ops */
+static const struct xive_ops *xive_ops;
+
+/* Our global interrupt domain */
+static struct irq_domain *xive_irq_domain;
+
+#ifdef CONFIG_SMP
+/* The IPIs all use the same logical irq number */
+static u32 xive_ipi_irq;
+#endif
+
+/* Xive state for each CPU */
+static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu);
+
+/*
+ * A "disabled" interrupt should never fire, to catch problems
+ * we set its logical number to this
+ */
+#define XIVE_BAD_IRQ		0x7fffffff
+#define XIVE_MAX_IRQ		(XIVE_BAD_IRQ - 1)
+
+/* An invalid CPU target */
+#define XIVE_INVALID_TARGET	(-1)
+
+/*
+ * Read the next entry in a queue, return its content if it's valid
+ * or 0 if there is no new entry.
+ *
+ * The queue pointer is moved forward unless "just_peek" is set
+ */
+static u32 xive_read_eq(struct xive_q *q, bool just_peek)
+{
+	u32 cur;
+
+	if (!q->qpage)
+		return 0;
+	cur = be32_to_cpup(q->qpage + q->idx);
+
+	/* Check valid bit (31) vs current toggle polarity */
+	if ((cur >> 31) == q->toggle)
+		return 0;
+
+	/* If consuming from the queue ... */
+	if (!just_peek) {
+		/* Next entry */
+		q->idx = (q->idx + 1) & q->msk;
+
+		/* Wrap around: flip valid toggle */
+		if (q->idx == 0)
+			q->toggle ^= 1;
+	}
+	/* Mask out the valid bit (31) */
+	return cur & 0x7fffffff;
+}
+
+/*
+ * Scans all the queue that may have interrupts in them
+ * (based on "pending_prio") in priority order until an
+ * interrupt is found or all the queues are empty.
+ *
+ * Then updates the CPPR (Current Processor Priority
+ * Register) based on the most favored interrupt found
+ * (0xff if none) and return what was found (0 if none).
+ *
+ * If just_peek is set, return the most favored pending
+ * interrupt if any but don't update the queue pointers.
+ *
+ * Note: This function can operate generically on any number
+ * of queues (up to 8). The current implementation of the XIVE
+ * driver only uses a single queue however.
+ *
+ * Note2: This will also "flush" "the pending_count" of a queue
+ * into the "count" when that queue is observed to be empty.
+ * This is used to keep track of the amount of interrupts
+ * targetting a queue. When an interrupt is moved away from
+ * a queue, we only decrement that queue count once the queue
+ * has been observed empty to avoid races.
+ */
+static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
+{
+	u32 irq = 0;
+	u8 prio;
+
+	/* Find highest pending priority */
+	while (xc->pending_prio != 0) {
+		struct xive_q *q;
+
+		prio = ffs(xc->pending_prio) - 1;
+		DBG_VERBOSE("scan_irq: trying prio %d\n", prio);
+
+		/* Try to fetch */
+		irq = xive_read_eq(&xc->queue[prio], just_peek);
+
+		/* Found something ? That's it */
+		if (irq)
+			break;
+
+		/* Clear pending bits */
+		xc->pending_prio &= ~(1 << prio);
+
+		/*
+		 * Check if the queue count needs adjusting due to
+		 * interrupts being moved away. See description of
+		 * xive_dec_target_count()
+		 */
+		q = &xc->queue[prio];
+		if (atomic_read(&q->pending_count)) {
+			int p = atomic_xchg(&q->pending_count, 0);
+			if (p) {
+				WARN_ON(p > atomic_read(&q->count));
+				atomic_sub(p, &q->count);
+			}
+		}
+	}
+
+	/* If nothing was found, set CPPR to 0xff */
+	if (irq == 0)
+		prio = 0xff;
+
+	/* Update HW CPPR to match if necessary */
+	if (prio != xc->cppr) {
+		DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio);
+		xc->cppr = prio;
+		out_8(xive_tima + xive_tima_offset + TM_CPPR, prio);
+	}
+
+	return irq;
+}
+
+/*
+ * This is used to perform the magic loads from an ESB
+ * described in xive.h
+ */
+static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset)
+{
+	u64 val;
+
+	/* Handle HW errata */
+	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
+		offset |= offset << 4;
+
+	val = in_be64(xd->eoi_mmio + offset);
+
+	return (u8)val;
+}
+
+#ifdef CONFIG_XMON
+static void xive_dump_eq(const char *name, struct xive_q *q)
+{
+	u32 i0, i1, idx;
+
+	if (!q->qpage)
+		return;
+	idx = q->idx;
+	i0 = be32_to_cpup(q->qpage + idx);
+	idx = (idx + 1) & q->msk;
+	i1 = be32_to_cpup(q->qpage + idx);
+	xmon_printf("  %s Q T=%d %08x %08x ...\n", name,
+		    q->toggle, i0, i1);
+}
+
+void xmon_xive_do_dump(int cpu)
+{
+	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+
+	xmon_printf("XIVE state for CPU %d:\n", cpu);
+	xmon_printf("  pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr);
+	xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]);
+#ifdef CONFIG_SMP
+	{
+		u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET);
+		xmon_printf("  IPI state: %x:%c%c\n", xc->hw_ipi,
+			val & XIVE_ESB_VAL_P ? 'P' : 'p',
+			val & XIVE_ESB_VAL_P ? 'Q' : 'q');
+	}
+#endif
+}
+#endif /* CONFIG_XMON */
+
+static unsigned int xive_get_irq(void)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+	u32 irq;
+
+	/*
+	 * This can be called either as a result of a HW interrupt or
+	 * as a "replay" because EOI decided there was still something
+	 * in one of the queues.
+	 *
+	 * First we perform an ACK cycle in order to update our mask
+	 * of pending priorities. This will also have the effect of
+	 * updating the CPPR to the most favored pending interrupts.
+	 *
+	 * In the future, if we have a way to differenciate a first
+	 * entry (on HW interrupt) from a replay triggered by EOI,
+	 * we could skip this on replays unless we soft-mask tells us
+	 * that a new HW interrupt occurred.
+	 */
+	xive_ops->update_pending(xc);
+
+	DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio);
+
+	/* Scan our queue(s) for interrupts */
+	irq = xive_scan_interrupts(xc, false);
+
+	DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n",
+	    irq, xc->pending_prio);
+
+	/* Return pending interrupt if any */
+	if (irq == XIVE_BAD_IRQ)
+		return 0;
+	return irq;
+}
+
+/*
+ * After EOI'ing an interrupt, we need to re-check the queue
+ * to see if another interrupt is pending since multiple
+ * interrupts can coalesce into a single notification to the
+ * CPU.
+ *
+ * If we find that there is indeed more in there, we call
+ * force_external_irq_replay() to make Linux synthetize an
+ * external interrupt on the next call to local_irq_restore().
+ */
+static void xive_do_queue_eoi(struct xive_cpu *xc)
+{
+	if (xive_scan_interrupts(xc, true) != 0) {
+		DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio);
+		force_external_irq_replay();
+	}
+}
+
+/*
+ * EOI an interrupt at the source. There are several methods
+ * to do this depending on the HW version and source type
+ */
+void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
+{
+	/* If the XIVE supports the new "store EOI facility, use it */
+	if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
+		out_be64(xd->eoi_mmio, 0);
+	else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
+		/*
+		 * The FW told us to call it. This happens for some
+		 * interrupt sources that need additional HW whacking
+		 * beyond the ESB manipulation. For example LPC interrupts
+		 * on P9 DD1.0 need a latch to be clared in the LPC bridge
+		 * itself. The Firmware will take care of it.
+		 */
+		if (WARN_ON_ONCE(!xive_ops->eoi))
+			return;
+		xive_ops->eoi(hw_irq);
+	} else {
+		u8 eoi_val;
+
+		/*
+		 * Otherwise for EOI, we use the special MMIO that does
+		 * a clear of both P and Q and returns the old Q,
+		 * except for LSIs where we use the "EOI cycle" special
+		 * load.
+		 *
+		 * This allows us to then do a re-trigger if Q was set
+		 * rather than synthesizing an interrupt in software
+		 *
+		 * For LSIs, using the HW EOI cycle works around a problem
+		 * on P9 DD1 PHBs where the other ESB accesses don't work
+		 * properly.
+		 */
+		if (xd->flags & XIVE_IRQ_FLAG_LSI)
+			in_be64(xd->eoi_mmio);
+		else {
+			eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
+			DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val);
+
+			/* Re-trigger if needed */
+			if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio)
+				out_be64(xd->trig_mmio, 0);
+		}
+	}
+}
+
+/* irq_chip eoi callback */
+static void xive_irq_eoi(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+
+	DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
+		    d->irq, irqd_to_hwirq(d), xc->pending_prio);
+
+	/* EOI the source if it hasn't been disabled */
+	if (!irqd_irq_disabled(d))
+		xive_do_source_eoi(irqd_to_hwirq(d), xd);
+
+	/*
+	 * Clear saved_p to indicate that it's no longer occupying
+	 * a queue slot on the target queue
+	 */
+	xd->saved_p = false;
+
+	/* Check for more work in the queue */
+	xive_do_queue_eoi(xc);
+}
+
+/*
+ * Helper used to mask and unmask an interrupt source. This
+ * is only called for normal interrupts that do not require
+ * masking/unmasking via firmware.
+ */
+static void xive_do_source_set_mask(struct xive_irq_data *xd,
+				    bool mask)
+{
+	u64 val;
+
+	/*
+	 * If the interrupt had P set, it may be in a queue.
+	 *
+	 * We need to make sure we don't re-enable it until it
+	 * has been fetched from that queue and EOId. We keep
+	 * a copy of that P state and use it to restore the
+	 * ESB accordingly on unmask.
+	 */
+	if (mask) {
+		val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01);
+		xd->saved_p = !!(val & XIVE_ESB_VAL_P);
+	} else if (xd->saved_p)
+		xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
+	else
+		xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
+}
+
+/*
+ * Try to chose "cpu" as a new interrupt target. Increments
+ * the queue accounting for that target if it's not already
+ * full.
+ */
+static bool xive_try_pick_target(int cpu)
+{
+	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+	struct xive_q *q = &xc->queue[xive_irq_priority];
+	int max;
+
+	/*
+	 * Calculate max number of interrupts in that queue.
+	 *
+	 * We leave a gap of 1 just in case...
+	 */
+	max = (q->msk + 1) - 1;
+	return !!atomic_add_unless(&q->count, 1, max);
+}
+
+/*
+ * Un-account an interrupt for a target CPU. We don't directly
+ * decrement q->count since the interrupt might still be present
+ * in the queue.
+ *
+ * Instead increment a separate counter "pending_count" which
+ * will be substracted from "count" later when that CPU observes
+ * the queue to be empty.
+ */
+static void xive_dec_target_count(int cpu)
+{
+	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+	struct xive_q *q = &xc->queue[xive_irq_priority];
+
+	if (unlikely(WARN_ON(cpu < 0 || !xc))) {
+		pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc);
+		return;
+	}
+
+	/*
+	 * We increment the "pending count" which will be used
+	 * to decrement the target queue count whenever it's next
+	 * processed and found empty. This ensure that we don't
+	 * decrement while we still have the interrupt there
+	 * occupying a slot.
+	 */
+	atomic_inc(&q->pending_count);
+}
+
+/* Find a tentative CPU target in a CPU mask */
+static int xive_find_target_in_mask(const struct cpumask *mask,
+				    unsigned int fuzz)
+{
+	int cpu, first, num, i;
+
+	/* Pick up a starting point CPU in the mask based on  fuzz */
+	num = cpumask_weight(mask);
+	first = fuzz % num;
+
+	/* Locate it */
+	cpu = cpumask_first(mask);
+	for (i = 0; i < first && cpu < nr_cpu_ids; i++)
+		cpu = cpumask_next(cpu, mask);
+
+	/* Sanity check */
+	if (WARN_ON(cpu >= nr_cpu_ids))
+		cpu = cpumask_first(cpu_online_mask);
+
+	/* Remember first one to handle wrap-around */
+	first = cpu;
+
+	/*
+	 * Now go through the entire mask until we find a valid
+	 * target.
+	 */
+	for (;;) {
+		/*
+		 * We re-check online as the fallback case passes us
+		 * an untested affinity mask
+		 */
+		if (cpu_online(cpu) && xive_try_pick_target(cpu))
+			return cpu;
+		cpu = cpumask_next(cpu, mask);
+		if (cpu == first)
+			break;
+		/* Wrap around */
+		if (cpu >= nr_cpu_ids)
+			cpu = cpumask_first(mask);
+	}
+	return -1;
+}
+
+/*
+ * Pick a target CPU for an interrupt. This is done at
+ * startup or if the affinity is changed in a way that
+ * invalidates the current target.
+ */
+static int xive_pick_irq_target(struct irq_data *d,
+				const struct cpumask *affinity)
+{
+	static unsigned int fuzz;
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	cpumask_var_t mask;
+	int cpu = -1;
+
+	/*
+	 * If we have chip IDs, first we try to build a mask of
+	 * CPUs matching the CPU and find a target in there
+	 */
+	if (xd->src_chip != XIVE_INVALID_CHIP_ID &&
+		zalloc_cpumask_var(&mask, GFP_ATOMIC)) {
+		/* Build a mask of matching chip IDs */
+		for_each_cpu_and(cpu, affinity, cpu_online_mask) {
+			struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+			if (xc->chip_id == xd->src_chip)
+				cpumask_set_cpu(cpu, mask);
+		}
+		/* Try to find a target */
+		if (cpumask_empty(mask))
+			cpu = -1;
+		else
+			cpu = xive_find_target_in_mask(mask, fuzz++);
+		free_cpumask_var(mask);
+		if (cpu >= 0)
+			return cpu;
+		fuzz--;
+	}
+
+	/* No chip IDs, fallback to using the affinity mask */
+	return xive_find_target_in_mask(affinity, fuzz++);
+}
+
+static unsigned int xive_irq_startup(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+	int target, rc;
+
+	pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
+		 d->irq, hw_irq, d);
+
+#ifdef CONFIG_PCI_MSI
+	/*
+	 * The generic MSI code returns with the interrupt disabled on the
+	 * card, using the MSI mask bits. Firmware doesn't appear to unmask
+	 * at that level, so we do it here by hand.
+	 */
+	if (irq_data_get_msi_desc(d))
+		pci_msi_unmask_irq(d);
+#endif
+
+	/* Pick a target */
+	target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d));
+	if (target == XIVE_INVALID_TARGET) {
+		/* Try again breaking affinity */
+		target = xive_pick_irq_target(d, cpu_online_mask);
+		if (target == XIVE_INVALID_TARGET)
+			return -ENXIO;
+		pr_warn("irq %d started with broken affinity\n", d->irq);
+	}
+
+	/* Sanity check */
+	if (WARN_ON(target == XIVE_INVALID_TARGET ||
+		    target >= nr_cpu_ids))
+		target = smp_processor_id();
+
+	xd->target = target;
+
+	/*
+	 * Configure the logical number to be the Linux IRQ number
+	 * and set the target queue
+	 */
+	rc = xive_ops->configure_irq(hw_irq,
+				     get_hard_smp_processor_id(target),
+				     xive_irq_priority, d->irq);
+	if (rc)
+		return rc;
+
+	/* Unmask the ESB */
+	xive_do_source_set_mask(xd, false);
+
+	return 0;
+}
+
+static void xive_irq_shutdown(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+
+	pr_devel("xive_irq_shutdown: irq %d [0x%x] data @%p\n",
+		 d->irq, hw_irq, d);
+
+	if (WARN_ON(xd->target == XIVE_INVALID_TARGET))
+		return;
+
+	/* Mask the interrupt at the source */
+	xive_do_source_set_mask(xd, true);
+
+	/*
+	 * The above may have set saved_p. We clear it otherwise it
+	 * will prevent re-enabling later on. It is ok to forget the
+	 * fact that the interrupt might be in a queue because we are
+	 * accounting that already in xive_dec_target_count() and will
+	 * be re-routing it to a new queue with proper accounting when
+	 * it's started up again
+	 */
+	xd->saved_p = false;
+
+	/*
+	 * Mask the interrupt in HW in the IVT/EAS and set the number
+	 * to be the "bad" IRQ number
+	 */
+	xive_ops->configure_irq(hw_irq,
+				get_hard_smp_processor_id(xd->target),
+				0xff, XIVE_BAD_IRQ);
+
+	xive_dec_target_count(xd->target);
+	xd->target = XIVE_INVALID_TARGET;
+}
+
+static void xive_irq_unmask(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+	pr_devel("xive_irq_unmask: irq %d data @%p\n", d->irq, xd);
+
+	/*
+	 * This is a workaround for PCI LSI problems on P9, for
+	 * these, we call FW to set the mask. The problems might
+	 * be fixed by P9 DD2.0, if that is the case, firmware
+	 * will no longer set that flag.
+	 */
+	if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
+		unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+		xive_ops->configure_irq(hw_irq,
+					get_hard_smp_processor_id(xd->target),
+					xive_irq_priority, d->irq);
+		return;
+	}
+
+	xive_do_source_set_mask(xd, false);
+}
+
+static void xive_irq_mask(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+	pr_devel("xive_irq_mask: irq %d data @%p\n", d->irq, xd);
+
+	/*
+	 * This is a workaround for PCI LSI problems on P9, for
+	 * these, we call OPAL to set the mask. The problems might
+	 * be fixed by P9 DD2.0, if that is the case, firmware
+	 * will no longer set that flag.
+	 */
+	if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
+		unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+		xive_ops->configure_irq(hw_irq,
+					get_hard_smp_processor_id(xd->target),
+					0xff, d->irq);
+		return;
+	}
+
+	xive_do_source_set_mask(xd, true);
+}
+
+static int xive_irq_set_affinity(struct irq_data *d,
+				 const struct cpumask *cpumask,
+				 bool force)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+	u32 target, old_target;
+	int rc = 0;
+
+	pr_devel("xive_irq_set_affinity: irq %d\n", d->irq);
+
+	/* Is this valid ? */
+	if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
+		return -EINVAL;
+
+	/*
+	 * If existing target is already in the new mask, and is
+	 * online then do nothing.
+	 */
+	if (xd->target != XIVE_INVALID_TARGET &&
+	    cpu_online(xd->target) &&
+	    cpumask_test_cpu(xd->target, cpumask))
+		return IRQ_SET_MASK_OK;
+
+	/* Pick a new target */
+	target = xive_pick_irq_target(d, cpumask);
+
+	/* No target found */
+	if (target == XIVE_INVALID_TARGET)
+		return -ENXIO;
+
+	/* Sanity check */
+	if (WARN_ON(target >= nr_cpu_ids))
+		target = smp_processor_id();
+
+	old_target = xd->target;
+
+	rc = xive_ops->configure_irq(hw_irq,
+				     get_hard_smp_processor_id(target),
+				     xive_irq_priority, d->irq);
+	if (rc < 0) {
+		pr_err("Error %d reconfiguring irq %d\n", rc, d->irq);
+		return rc;
+	}
+
+	pr_devel("  target: 0x%x\n", target);
+	xd->target = target;
+
+	/* Give up previous target */
+	if (old_target != XIVE_INVALID_TARGET)
+	    xive_dec_target_count(old_target);
+
+	return IRQ_SET_MASK_OK;
+}
+
+static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+	/*
+	 * We only support these. This has really no effect other than setting
+	 * the corresponding descriptor bits mind you but those will in turn
+	 * affect the resend function when re-enabling an edge interrupt.
+	 *
+	 * Set set the default to edge as explained in map().
+	 */
+	if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE)
+		flow_type = IRQ_TYPE_EDGE_RISING;
+
+	if (flow_type != IRQ_TYPE_EDGE_RISING &&
+	    flow_type != IRQ_TYPE_LEVEL_LOW)
+		return -EINVAL;
+
+	irqd_set_trigger_type(d, flow_type);
+
+	/*
+	 * Double check it matches what the FW thinks
+	 *
+	 * NOTE: We don't know yet if the PAPR interface will provide
+	 * the LSI vs MSI information apart from the device-tree so
+	 * this check might have to move into an optional backend call
+	 * that is specific to the native backend
+	 */
+	if ((flow_type == IRQ_TYPE_LEVEL_LOW) !=
+	    !!(xd->flags & XIVE_IRQ_FLAG_LSI)) {
+		pr_warn("Interrupt %d (HW 0x%x) type mismatch, Linux says %s, FW says %s\n",
+			d->irq, (u32)irqd_to_hwirq(d),
+			(flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge",
+			(xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge");
+	}
+
+	return IRQ_SET_MASK_OK_NOCOPY;
+}
+
+static int xive_irq_retrigger(struct irq_data *d)
+{
+	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+	/* This should be only for MSIs */
+	if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
+		return 0;
+
+	/*
+	 * To perform a retrigger, we first set the PQ bits to
+	 * 11, then perform an EOI.
+	 */
+	xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
+
+	/*
+	 * Note: We pass "0" to the hw_irq argument in order to
+	 * avoid calling into the backend EOI code which we don't
+	 * want to do in the case of a re-trigger. Backends typically
+	 * only do EOI for LSIs anyway.
+	 */
+	xive_do_source_eoi(0, xd);
+
+	return 1;
+}
+
+static struct irq_chip xive_irq_chip = {
+	.name = "XIVE-IRQ",
+	.irq_startup = xive_irq_startup,
+	.irq_shutdown = xive_irq_shutdown,
+	.irq_eoi = xive_irq_eoi,
+	.irq_mask = xive_irq_mask,
+	.irq_unmask = xive_irq_unmask,
+	.irq_set_affinity = xive_irq_set_affinity,
+	.irq_set_type = xive_irq_set_type,
+	.irq_retrigger = xive_irq_retrigger,
+};
+
+bool is_xive_irq(struct irq_chip *chip)
+{
+	return chip == &xive_irq_chip;
+}
+
+void xive_cleanup_irq_data(struct xive_irq_data *xd)
+{
+	if (xd->eoi_mmio) {
+		iounmap(xd->eoi_mmio);
+		if (xd->eoi_mmio == xd->trig_mmio)
+			xd->trig_mmio = NULL;
+		xd->eoi_mmio = NULL;
+	}
+	if (xd->trig_mmio) {
+		iounmap(xd->trig_mmio);
+		xd->trig_mmio = NULL;
+	}
+}
+
+static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
+{
+	struct xive_irq_data *xd;
+	int rc;
+
+	xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL);
+	if (!xd)
+		return -ENOMEM;
+	rc = xive_ops->populate_irq_data(hw, xd);
+	if (rc) {
+		kfree(xd);
+		return rc;
+	}
+	xd->target = XIVE_INVALID_TARGET;
+	irq_set_handler_data(virq, xd);
+
+	return 0;
+}
+
+static void xive_irq_free_data(unsigned int virq)
+{
+	struct xive_irq_data *xd = irq_get_handler_data(virq);
+
+	if (!xd)
+		return;
+	irq_set_handler_data(virq, NULL);
+	xive_cleanup_irq_data(xd);
+	kfree(xd);
+}
+
+#ifdef CONFIG_SMP
+
+static void xive_cause_ipi(int cpu)
+{
+	struct xive_cpu *xc;
+	struct xive_irq_data *xd;
+
+	xc = per_cpu(xive_cpu, cpu);
+
+	DBG_VERBOSE("IPI CPU %d -> %d (HW IRQ 0x%x)\n",
+		    smp_processor_id(), cpu, xc->hw_ipi);
+
+	xd = &xc->ipi_data;
+	if (WARN_ON(!xd->trig_mmio))
+		return;
+	out_be64(xd->trig_mmio, 0);
+}
+
+static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id)
+{
+	return smp_ipi_demux();
+}
+
+static void xive_ipi_eoi(struct irq_data *d)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+
+	/* Handle possible race with unplug and drop stale IPIs */
+	if (!xc)
+		return;
+	xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data);
+	xive_do_queue_eoi(xc);
+}
+
+static void xive_ipi_do_nothing(struct irq_data *d)
+{
+	/*
+	 * Nothing to do, we never mask/unmask IPIs, but the callback
+	 * has to exist for the struct irq_chip.
+	 */
+}
+
+static struct irq_chip xive_ipi_chip = {
+	.name = "XIVE-IPI",
+	.irq_eoi = xive_ipi_eoi,
+	.irq_mask = xive_ipi_do_nothing,
+	.irq_unmask = xive_ipi_do_nothing,
+};
+
+static void __init xive_request_ipi(void)
+{
+	unsigned int virq;
+
+	/*
+	 * Initialization failed, move on, we might manage to
+	 * reach the point where we display our errors before
+	 * the system falls appart
+	 */
+	if (!xive_irq_domain)
+		return;
+
+	/* Initialize it */
+	virq = irq_create_mapping(xive_irq_domain, 0);
+	xive_ipi_irq = virq;
+
+	WARN_ON(request_irq(virq, xive_muxed_ipi_action,
+			    IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
+}
+
+static int xive_setup_cpu_ipi(unsigned int cpu)
+{
+	struct xive_cpu *xc;
+	int rc;
+
+	pr_debug("Setting up IPI for CPU %d\n", cpu);
+
+	xc = per_cpu(xive_cpu, cpu);
+
+	/* Check if we are already setup */
+	if (xc->hw_ipi != 0)
+		return 0;
+
+	/* Grab an IPI from the backend, this will populate xc->hw_ipi */
+	if (xive_ops->get_ipi(cpu, xc))
+		return -EIO;
+
+	/*
+	 * Populate the IRQ data in the xive_cpu structure and
+	 * configure the HW / enable the IPIs.
+	 */
+	rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data);
+	if (rc) {
+		pr_err("Failed to populate IPI data on CPU %d\n", cpu);
+		return -EIO;
+	}
+	rc = xive_ops->configure_irq(xc->hw_ipi,
+				     get_hard_smp_processor_id(cpu),
+				     xive_irq_priority, xive_ipi_irq);
+	if (rc) {
+		pr_err("Failed to map IPI CPU %d\n", cpu);
+		return -EIO;
+	}
+	pr_devel("CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu,
+	    xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio);
+
+	/* Unmask it */
+	xive_do_source_set_mask(&xc->ipi_data, false);
+
+	return 0;
+}
+
+static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
+{
+	/* Disable the IPI and free the IRQ data */
+
+	/* Already cleaned up ? */
+	if (xc->hw_ipi == 0)
+		return;
+
+	/* Mask the IPI */
+	xive_do_source_set_mask(&xc->ipi_data, true);
+
+	/*
+	 * Note: We don't call xive_cleanup_irq_data() to free
+	 * the mappings as this is called from an IPI on kexec
+	 * which is not a safe environment to call iounmap()
+	 */
+
+	/* Deconfigure/mask in the backend */
+	xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(),
+				0xff, xive_ipi_irq);
+
+	/* Free the IPIs in the backend */
+	xive_ops->put_ipi(cpu, xc);
+}
+
+void __init xive_smp_probe(void)
+{
+	smp_ops->cause_ipi = xive_cause_ipi;
+
+	/* Register the IPI */
+	xive_request_ipi();
+
+	/* Allocate and setup IPI for the boot CPU */
+	xive_setup_cpu_ipi(smp_processor_id());
+}
+
+#endif /* CONFIG_SMP */
+
+static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
+			       irq_hw_number_t hw)
+{
+	int rc;
+
+	/*
+	 * Mark interrupts as edge sensitive by default so that resend
+	 * actually works. Will fix that up below if needed.
+	 */
+	irq_clear_status_flags(virq, IRQ_LEVEL);
+
+#ifdef CONFIG_SMP
+	/* IPIs are special and come up with HW number 0 */
+	if (hw == 0) {
+		/*
+		 * IPIs are marked per-cpu. We use separate HW interrupts under
+		 * the hood but associated with the same "linux" interrupt
+		 */
+		irq_set_chip_and_handler(virq, &xive_ipi_chip,
+					 handle_percpu_irq);
+		return 0;
+	}
+#endif
+
+	rc = xive_irq_alloc_data(virq, hw);
+	if (rc)
+		return rc;
+
+	irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq);
+
+	return 0;
+}
+
+static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq)
+{
+	struct irq_data *data = irq_get_irq_data(virq);
+	unsigned int hw_irq;
+
+	/* XXX Assign BAD number */
+	if (!data)
+		return;
+	hw_irq = (unsigned int)irqd_to_hwirq(data);
+	if (hw_irq)
+		xive_irq_free_data(virq);
+}
+
+static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct,
+				 const u32 *intspec, unsigned int intsize,
+				 irq_hw_number_t *out_hwirq, unsigned int *out_flags)
+
+{
+	*out_hwirq = intspec[0];
+
+	/*
+	 * If intsize is at least 2, we look for the type in the second cell,
+	 * we assume the LSB indicates a level interrupt.
+	 */
+	if (intsize > 1) {
+		if (intspec[1] & 1)
+			*out_flags = IRQ_TYPE_LEVEL_LOW;
+		else
+			*out_flags = IRQ_TYPE_EDGE_RISING;
+	} else
+		*out_flags = IRQ_TYPE_LEVEL_LOW;
+
+	return 0;
+}
+
+static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node,
+				 enum irq_domain_bus_token bus_token)
+{
+	return xive_ops->match(node);
+}
+
+static const struct irq_domain_ops xive_irq_domain_ops = {
+	.match = xive_irq_domain_match,
+	.map = xive_irq_domain_map,
+	.unmap = xive_irq_domain_unmap,
+	.xlate = xive_irq_domain_xlate,
+};
+
+static void __init xive_init_host(void)
+{
+	xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ,
+					       &xive_irq_domain_ops, NULL);
+	if (WARN_ON(xive_irq_domain == NULL))
+		return;
+	irq_set_default_host(xive_irq_domain);
+}
+
+static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
+{
+	if (xc->queue[xive_irq_priority].qpage)
+		xive_ops->cleanup_queue(cpu, xc, xive_irq_priority);
+}
+
+static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
+{
+	int rc = 0;
+
+	/* We setup 1 queues for now with a 64k page */
+	if (!xc->queue[xive_irq_priority].qpage)
+		rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority);
+
+	return rc;
+}
+
+static int xive_prepare_cpu(unsigned int cpu)
+{
+	struct xive_cpu *xc;
+
+	xc = per_cpu(xive_cpu, cpu);
+	if (!xc) {
+		struct device_node *np;
+
+		xc = kzalloc_node(sizeof(struct xive_cpu),
+				  GFP_KERNEL, cpu_to_node(cpu));
+		if (!xc)
+			return -ENOMEM;
+		np = of_get_cpu_node(cpu, NULL);
+		if (np)
+			xc->chip_id = of_get_ibm_chip_id(np);
+		of_node_put(np);
+
+		per_cpu(xive_cpu, cpu) = xc;
+	}
+
+	/* Setup EQs if not already */
+	return xive_setup_cpu_queues(cpu, xc);
+}
+
+static void xive_setup_cpu(void)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+
+	/* Debug: Dump the TM state */
+	pr_devel("CPU %d [HW 0x%02x] VT=%02x\n",
+	    smp_processor_id(), hard_smp_processor_id(),
+	    in_8(xive_tima + xive_tima_offset + TM_WORD2));
+
+	/* The backend might have additional things to do */
+	if (xive_ops->setup_cpu)
+		xive_ops->setup_cpu(smp_processor_id(), xc);
+
+	/* Set CPPR to 0xff to enable flow of interrupts */
+	xc->cppr = 0xff;
+	out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff);
+}
+
+#ifdef CONFIG_SMP
+void xive_smp_setup_cpu(void)
+{
+	pr_devel("SMP setup CPU %d\n", smp_processor_id());
+
+	/* This will have already been done on the boot CPU */
+	if (smp_processor_id() != boot_cpuid)
+		xive_setup_cpu();
+
+}
+
+int xive_smp_prepare_cpu(unsigned int cpu)
+{
+	int rc;
+
+	/* Allocate per-CPU data and queues */
+	rc = xive_prepare_cpu(cpu);
+	if (rc)
+		return rc;
+
+	/* Allocate and setup IPI for the new CPU */
+	return xive_setup_cpu_ipi(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
+{
+	u32 irq;
+
+	/* We assume local irqs are disabled */
+	WARN_ON(!irqs_disabled());
+
+	/* Check what's already in the CPU queue */
+	while ((irq = xive_scan_interrupts(xc, false)) != 0) {
+		/*
+		 * We need to re-route that interrupt to its new destination.
+		 * First get and lock the descriptor
+		 */
+		struct irq_desc *desc = irq_to_desc(irq);
+		struct irq_data *d = irq_desc_get_irq_data(desc);
+		struct xive_irq_data *xd;
+		unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+
+		/*
+		 * Ignore anything that isn't a XIVE irq and ignore
+		 * IPIs, so can just be dropped.
+		 */
+		if (d->domain != xive_irq_domain || hw_irq == 0)
+			continue;
+
+		/*
+		 * The IRQ should have already been re-routed, it's just a
+		 * stale in the old queue, so re-trigger it in order to make
+		 * it reach is new destination.
+		 */
+#ifdef DEBUG_FLUSH
+		pr_info("CPU %d: Got irq %d while offline, re-sending...\n",
+			cpu, irq);
+#endif
+		raw_spin_lock(&desc->lock);
+		xd = irq_desc_get_handler_data(desc);
+
+		/*
+		 * For LSIs, we EOI, this will cause a resend if it's
+		 * still asserted. Otherwise do an MSI retrigger.
+		 */
+		if (xd->flags & XIVE_IRQ_FLAG_LSI)
+			xive_do_source_eoi(irqd_to_hwirq(d), xd);
+		else
+			xive_irq_retrigger(d);
+
+		raw_spin_unlock(&desc->lock);
+	}
+}
+
+void xive_smp_disable_cpu(void)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+	unsigned int cpu = smp_processor_id();
+
+	/* Migrate interrupts away from the CPU */
+	irq_migrate_all_off_this_cpu();
+
+	/* Set CPPR to 0 to disable flow of interrupts */
+	xc->cppr = 0;
+	out_8(xive_tima + xive_tima_offset + TM_CPPR, 0);
+
+	/* Flush everything still in the queue */
+	xive_flush_cpu_queue(cpu, xc);
+
+	/* Re-enable CPPR  */
+	xc->cppr = 0xff;
+	out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff);
+}
+
+void xive_flush_interrupt(void)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+	unsigned int cpu = smp_processor_id();
+
+	/* Called if an interrupt occurs while the CPU is hot unplugged */
+	xive_flush_cpu_queue(cpu, xc);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#endif /* CONFIG_SMP */
+
+void xive_kexec_teardown_cpu(int secondary)
+{
+	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+	unsigned int cpu = smp_processor_id();
+
+	/* Set CPPR to 0 to disable flow of interrupts */
+	xc->cppr = 0;
+	out_8(xive_tima + xive_tima_offset + TM_CPPR, 0);
+
+	/* Backend cleanup if any */
+	if (xive_ops->teardown_cpu)
+		xive_ops->teardown_cpu(cpu, xc);
+
+#ifdef CONFIG_SMP
+	/* Get rid of IPI */
+	xive_cleanup_cpu_ipi(cpu, xc);
+#endif
+
+	/* Disable and free the queues */
+	xive_cleanup_cpu_queues(cpu, xc);
+}
+
+void xive_shutdown(void)
+{
+	xive_ops->shutdown();
+}
+
+bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
+		    u8 max_prio)
+{
+	xive_tima = area;
+	xive_tima_offset = offset;
+	xive_ops = ops;
+	xive_irq_priority = max_prio;
+
+	ppc_md.get_irq = xive_get_irq;
+	__xive_enabled = true;
+
+	pr_devel("Initializing host..\n");
+	xive_init_host();
+
+	pr_devel("Initializing boot CPU..\n");
+
+	/* Allocate per-CPU data and queues */
+	xive_prepare_cpu(smp_processor_id());
+
+	/* Get ready for interrupts */
+	xive_setup_cpu();
+
+	pr_info("Interrupt handling intialized with %s backend\n",
+		xive_ops->name);
+	pr_info("Using priority %d for all interrupts\n", max_prio);
+
+	return true;
+}
+
+static int __init xive_off(char *arg)
+{
+	xive_cmdline_disabled = true;
+	return 0;
+}
+__setup("xive=off", xive_off);
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
new file mode 100644
index 000000000000..1a726229a427
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -0,0 +1,640 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "xive: " fmt
+
+#include <linux/types.h>
+#include <linux/irq.h>
+#include <linux/debugfs.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/cpumask.h>
+#include <linux/mm.h>
+
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/irq.h>
+#include <asm/errno.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/opal.h>
+
+#include "xive-internal.h"
+
+
+static u32 xive_provision_size;
+static u32 *xive_provision_chips;
+static u32 xive_provision_chip_count;
+static u32 xive_queue_shift;
+static u32 xive_pool_vps = XIVE_INVALID_VP;
+static struct kmem_cache *xive_provision_cache;
+
+int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
+{
+	__be64 flags, eoi_page, trig_page;
+	__be32 esb_shift, src_chip;
+	u64 opal_flags;
+	s64 rc;
+
+	memset(data, 0, sizeof(*data));
+
+	rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page,
+				    &esb_shift, &src_chip);
+	if (rc) {
+		pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n",
+		       hw_irq, rc);
+		return -EINVAL;
+	}
+
+	opal_flags = be64_to_cpu(flags);
+	if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI)
+		data->flags |= XIVE_IRQ_FLAG_STORE_EOI;
+	if (opal_flags & OPAL_XIVE_IRQ_LSI)
+		data->flags |= XIVE_IRQ_FLAG_LSI;
+	if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG)
+		data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG;
+	if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW)
+		data->flags |= XIVE_IRQ_FLAG_MASK_FW;
+	if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW)
+		data->flags |= XIVE_IRQ_FLAG_EOI_FW;
+	data->eoi_page = be64_to_cpu(eoi_page);
+	data->trig_page = be64_to_cpu(trig_page);
+	data->esb_shift = be32_to_cpu(esb_shift);
+	data->src_chip = be32_to_cpu(src_chip);
+
+	data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift);
+	if (!data->eoi_mmio) {
+		pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq);
+		return -ENOMEM;
+	}
+
+	if (!data->trig_page)
+		return 0;
+	if (data->trig_page == data->eoi_page) {
+		data->trig_mmio = data->eoi_mmio;
+		return 0;
+	}
+
+	data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift);
+	if (!data->trig_mmio) {
+		pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
+{
+	s64 rc;
+
+	for (;;) {
+		rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	return rc == 0 ? 0 : -ENXIO;
+}
+
+/* This can be called multiple time to change a queue configuration */
+int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
+				__be32 *qpage, u32 order, bool can_escalate)
+{
+	s64 rc = 0;
+	__be64 qeoi_page_be;
+	__be32 esc_irq_be;
+	u64 flags, qpage_phys;
+
+	/* If there's an actual queue page, clean it */
+	if (order) {
+		if (WARN_ON(!qpage))
+			return -EINVAL;
+		qpage_phys = __pa(qpage);
+	} else
+		qpage_phys = 0;
+
+	/* Initialize the rest of the fields */
+	q->msk = order ? ((1u << (order - 2)) - 1) : 0;
+	q->idx = 0;
+	q->toggle = 0;
+
+	rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL,
+				      &qeoi_page_be,
+				      &esc_irq_be,
+				      NULL);
+	if (rc) {
+		pr_err("Error %lld getting queue info prio %d\n", rc, prio);
+		rc = -EIO;
+		goto fail;
+	}
+	q->eoi_phys = be64_to_cpu(qeoi_page_be);
+
+	/* Default flags */
+	flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED;
+
+	/* Escalation needed ? */
+	if (can_escalate) {
+		q->esc_irq = be32_to_cpu(esc_irq_be);
+		flags |= OPAL_XIVE_EQ_ESCALATE;
+	}
+
+	/* Configure and enable the queue in HW */
+	for (;;) {
+		rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	if (rc) {
+		pr_err("Error %lld setting queue for prio %d\n", rc, prio);
+		rc = -EIO;
+	} else {
+		/*
+		 * KVM code requires all of the above to be visible before
+		 * q->qpage is set due to how it manages IPI EOIs
+		 */
+		wmb();
+		q->qpage = qpage;
+	}
+fail:
+	return rc;
+}
+
+static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
+{
+	s64 rc;
+
+	/* Disable the queue in HW */
+	for (;;) {
+		rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	if (rc)
+		pr_err("Error %lld disabling queue for prio %d\n", rc, prio);
+}
+
+void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
+{
+	__xive_native_disable_queue(vp_id, q, prio);
+}
+
+static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
+{
+	struct xive_q *q = &xc->queue[prio];
+	unsigned int alloc_order;
+	struct page *pages;
+	__be32 *qpage;
+
+	alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
+		(xive_queue_shift - PAGE_SHIFT) : 0;
+	pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order);
+	if (!pages)
+		return -ENOMEM;
+	qpage = (__be32 *)page_address(pages);
+	memset(qpage, 0, 1 << xive_queue_shift);
+	return xive_native_configure_queue(get_hard_smp_processor_id(cpu),
+					   q, prio, qpage, xive_queue_shift, false);
+}
+
+static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
+{
+	struct xive_q *q = &xc->queue[prio];
+	unsigned int alloc_order;
+
+	/*
+	 * We use the variant with no iounmap as this is called on exec
+	 * from an IPI and iounmap isn't safe
+	 */
+	__xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio);
+	alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
+		(xive_queue_shift - PAGE_SHIFT) : 0;
+	free_pages((unsigned long)q->qpage, alloc_order);
+	q->qpage = NULL;
+}
+
+static bool xive_native_match(struct device_node *node)
+{
+	return of_device_is_compatible(node, "ibm,opal-xive-vc");
+}
+
+#ifdef CONFIG_SMP
+static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc)
+{
+	struct device_node *np;
+	unsigned int chip_id;
+	s64 irq;
+
+	/* Find the chip ID */
+	np = of_get_cpu_node(cpu, NULL);
+	if (np) {
+		if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0)
+			chip_id = 0;
+	}
+
+	/* Allocate an IPI and populate info about it */
+	for (;;) {
+		irq = opal_xive_allocate_irq(chip_id);
+		if (irq == OPAL_BUSY) {
+			msleep(1);
+			continue;
+		}
+		if (irq < 0) {
+			pr_err("Failed to allocate IPI on CPU %d\n", cpu);
+			return -ENXIO;
+		}
+		xc->hw_ipi = irq;
+		break;
+	}
+	return 0;
+}
+
+u32 xive_native_alloc_irq(void)
+{
+	s64 rc;
+
+	for (;;) {
+		rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	if (rc < 0)
+		return 0;
+	return rc;
+}
+
+void xive_native_free_irq(u32 irq)
+{
+	for (;;) {
+		s64 rc = opal_xive_free_irq(irq);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+}
+
+static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc)
+{
+	s64 rc;
+
+	/* Free the IPI */
+	if (!xc->hw_ipi)
+		return;
+	for (;;) {
+		rc = opal_xive_free_irq(xc->hw_ipi);
+		if (rc == OPAL_BUSY) {
+			msleep(1);
+			continue;
+		}
+		xc->hw_ipi = 0;
+		break;
+	}
+}
+#endif /* CONFIG_SMP */
+
+static void xive_native_shutdown(void)
+{
+	/* Switch the XIVE to emulation mode */
+	opal_xive_reset(OPAL_XIVE_MODE_EMU);
+}
+
+/*
+ * Perform an "ack" cycle on the current thread, thus
+ * grabbing the pending active priorities and updating
+ * the CPPR to the most favored one.
+ */
+static void xive_native_update_pending(struct xive_cpu *xc)
+{
+	u8 he, cppr;
+	u16 ack;
+
+	/* Perform the acknowledge hypervisor to register cycle */
+	ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG));
+
+	/* Synchronize subsequent queue accesses */
+	mb();
+
+	/*
+	 * Grab the CPPR and the "HE" field which indicates the source
+	 * of the hypervisor interrupt (if any)
+	 */
+	cppr = ack & 0xff;
+	he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8));
+	switch(he) {
+	case TM_QW3_NSR_HE_NONE: /* Nothing to see here */
+		break;
+	case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */
+		if (cppr == 0xff)
+			return;
+		/* Mark the priority pending */
+		xc->pending_prio |= 1 << cppr;
+
+		/*
+		 * A new interrupt should never have a CPPR less favored
+		 * than our current one.
+		 */
+		if (cppr >= xc->cppr)
+			pr_err("CPU %d odd ack CPPR, got %d at %d\n",
+			       smp_processor_id(), cppr, xc->cppr);
+
+		/* Update our idea of what the CPPR is */
+		xc->cppr = cppr;
+		break;
+	case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */
+	case TM_QW3_NSR_HE_LSI:  /* Legacy FW LSI (unused) */
+		pr_err("CPU %d got unexpected interrupt type HE=%d\n",
+		       smp_processor_id(), he);
+		return;
+	}
+}
+
+static void xive_native_eoi(u32 hw_irq)
+{
+	/*
+	 * Not normally used except if specific interrupts need
+	 * a workaround on EOI.
+	 */
+	opal_int_eoi(hw_irq);
+}
+
+static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc)
+{
+	s64 rc;
+	u32 vp;
+	__be64 vp_cam_be;
+	u64 vp_cam;
+
+	if (xive_pool_vps == XIVE_INVALID_VP)
+		return;
+
+	/* Enable the pool VP */
+	vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
+	pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp);
+	for (;;) {
+		rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+	if (rc) {
+		pr_err("Failed to enable pool VP on CPU %d\n", cpu);
+		return;
+	}
+
+	/* Grab it's CAM value */
+	rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL);
+	if (rc) {
+		pr_err("Failed to get pool VP info CPU %d\n", cpu);
+		return;
+	}
+	vp_cam = be64_to_cpu(vp_cam_be);
+
+	pr_debug("VP CAM = %llx\n", vp_cam);
+
+	/* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */
+	pr_debug("(Old HW value: %08x)\n",
+		 in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
+	out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff);
+	out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2,
+		 TM_QW2W2_VP | vp_cam);
+	pr_debug("(New HW value: %08x)\n",
+		 in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
+}
+
+static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc)
+{
+	s64 rc;
+	u32 vp;
+
+	if (xive_pool_vps == XIVE_INVALID_VP)
+		return;
+
+	/* Pull the pool VP from the CPU */
+	in_be64(xive_tima + TM_SPC_PULL_POOL_CTX);
+
+	/* Disable it */
+	vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
+	for (;;) {
+		rc = opal_xive_set_vp_info(vp, 0, 0);
+		if (rc != OPAL_BUSY)
+			break;
+		msleep(1);
+	}
+}
+
+static void xive_native_sync_source(u32 hw_irq)
+{
+	opal_xive_sync(XIVE_SYNC_EAS, hw_irq);
+}
+
+static const struct xive_ops xive_native_ops = {
+	.populate_irq_data	= xive_native_populate_irq_data,
+	.configure_irq		= xive_native_configure_irq,
+	.setup_queue		= xive_native_setup_queue,
+	.cleanup_queue		= xive_native_cleanup_queue,
+	.match			= xive_native_match,
+	.shutdown		= xive_native_shutdown,
+	.update_pending		= xive_native_update_pending,
+	.eoi			= xive_native_eoi,
+	.setup_cpu		= xive_native_setup_cpu,
+	.teardown_cpu		= xive_native_teardown_cpu,
+	.sync_source		= xive_native_sync_source,
+#ifdef CONFIG_SMP
+	.get_ipi		= xive_native_get_ipi,
+	.put_ipi		= xive_native_put_ipi,
+#endif /* CONFIG_SMP */
+	.name			= "native",
+};
+
+static bool xive_parse_provisioning(struct device_node *np)
+{
+	int rc;
+
+	if (of_property_read_u32(np, "ibm,xive-provision-page-size",
+				 &xive_provision_size) < 0)
+		return true;
+	rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4);
+	if (rc < 0) {
+		pr_err("Error %d getting provision chips array\n", rc);
+		return false;
+	}
+	xive_provision_chip_count = rc;
+	if (rc == 0)
+		return true;
+
+	xive_provision_chips = kzalloc(4 * xive_provision_chip_count,
+				       GFP_KERNEL);
+	if (WARN_ON(!xive_provision_chips))
+		return false;
+
+	rc = of_property_read_u32_array(np, "ibm,xive-provision-chips",
+					xive_provision_chips,
+					xive_provision_chip_count);
+	if (rc < 0) {
+		pr_err("Error %d reading provision chips array\n", rc);
+		return false;
+	}
+
+	xive_provision_cache = kmem_cache_create("xive-provision",
+						 xive_provision_size,
+						 xive_provision_size,
+						 0, NULL);
+	if (!xive_provision_cache) {
+		pr_err("Failed to allocate provision cache\n");
+		return false;
+	}
+	return true;
+}
+
+u32 xive_native_default_eq_shift(void)
+{
+	return xive_queue_shift;
+}
+
+bool xive_native_init(void)
+{
+	struct device_node *np;
+	struct resource r;
+	void __iomem *tima;
+	struct property *prop;
+	u8 max_prio = 7;
+	const __be32 *p;
+	u32 val;
+	s64 rc;
+
+	if (xive_cmdline_disabled)
+		return false;
+
+	pr_devel("xive_native_init()\n");
+	np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe");
+	if (!np) {
+		pr_devel("not found !\n");
+		return false;
+	}
+	pr_devel("Found %s\n", np->full_name);
+
+	/* Resource 1 is HV window */
+	if (of_address_to_resource(np, 1, &r)) {
+		pr_err("Failed to get thread mgmnt area resource\n");
+		return false;
+	}
+	tima = ioremap(r.start, resource_size(&r));
+	if (!tima) {
+		pr_err("Failed to map thread mgmnt area\n");
+		return false;
+	}
+
+	/* Read number of priorities */
+	if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0)
+		max_prio = val - 1;
+
+	/* Iterate the EQ sizes and pick one */
+	of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) {
+		xive_queue_shift = val;
+		if (val == PAGE_SHIFT)
+			break;
+	}
+
+	/* Grab size of provisioning pages */
+	xive_parse_provisioning(np);
+
+	/* Switch the XIVE to exploitation mode */
+	rc = opal_xive_reset(OPAL_XIVE_MODE_EXPL);
+	if (rc) {
+		pr_err("Switch to exploitation mode failed with error %lld\n", rc);
+		return false;
+	}
+
+	/* Initialize XIVE core with our backend */
+	if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS,
+			    max_prio)) {
+		opal_xive_reset(OPAL_XIVE_MODE_EMU);
+		return false;
+	}
+	pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10));
+	return true;
+}
+
+static bool xive_native_provision_pages(void)
+{
+	u32 i;
+	void *p;
+
+	for (i = 0; i < xive_provision_chip_count; i++) {
+		u32 chip = xive_provision_chips[i];
+
+		/*
+		 * XXX TODO: Try to make the allocation local to the node where
+		 * the chip resides.
+		 */
+		p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL);
+		if (!p) {
+			pr_err("Failed to allocate provisioning page\n");
+			return false;
+		}
+		opal_xive_donate_page(chip, __pa(p));
+	}
+	return true;
+}
+
+u32 xive_native_alloc_vp_block(u32 max_vcpus)
+{
+	s64 rc;
+	u32 order;
+
+	order = fls(max_vcpus) - 1;
+	if (max_vcpus > (1 << order))
+		order++;
+
+	pr_info("VP block alloc, for max VCPUs %d use order %d\n",
+		max_vcpus, order);
+
+	for (;;) {
+		rc = opal_xive_alloc_vp_block(order);
+		switch (rc) {
+		case OPAL_BUSY:
+			msleep(1);
+			break;
+		case OPAL_XIVE_PROVISIONING:
+			if (!xive_native_provision_pages())
+				return XIVE_INVALID_VP;
+			break;
+		default:
+			if (rc < 0) {
+				pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n",
+				       order, rc);
+				return XIVE_INVALID_VP;
+			}
+			return rc;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block);
+
+void xive_native_free_vp_block(u32 vp_base)
+{
+	s64 rc;
+
+	if (vp_base == XIVE_INVALID_VP)
+		return;
+
+	rc = opal_xive_free_vp_block(vp_base);
+	if (rc < 0)
+		pr_warn("OPAL error %lld freeing VP block\n", rc);
+}
+EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
new file mode 100644
index 000000000000..d07ef2d29caf
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/xive-internal.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef __XIVE_INTERNAL_H
+#define __XIVE_INTERNAL_H
+
+/* Each CPU carry one of these with various per-CPU state */
+struct xive_cpu {
+#ifdef CONFIG_SMP
+	/* HW irq number and data of IPI */
+	u32 hw_ipi;
+	struct xive_irq_data ipi_data;
+#endif /* CONFIG_SMP */
+
+	int chip_id;
+
+	/* Queue datas. Only one is populated */
+#define XIVE_MAX_QUEUES	8
+	struct xive_q queue[XIVE_MAX_QUEUES];
+
+	/*
+	 * Pending mask. Each bit corresponds to a priority that
+	 * potentially has pending interrupts.
+	 */
+	u8 pending_prio;
+
+	/* Cache of HW CPPR */
+	u8 cppr;
+};
+
+/* Backend ops */
+struct xive_ops {
+	int	(*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
+	int 	(*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
+	int	(*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
+	void	(*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
+	void	(*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
+	void	(*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc);
+	bool	(*match)(struct device_node *np);
+	void	(*shutdown)(void);
+
+	void	(*update_pending)(struct xive_cpu *xc);
+	void	(*eoi)(u32 hw_irq);
+	void	(*sync_source)(u32 hw_irq);
+#ifdef CONFIG_SMP
+	int	(*get_ipi)(unsigned int cpu, struct xive_cpu *xc);
+	void	(*put_ipi)(unsigned int cpu, struct xive_cpu *xc);
+#endif
+	const char *name;
+};
+
+bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
+		    u8 max_prio);
+
+extern bool xive_cmdline_disabled;
+
+#endif /*  __XIVE_INTERNAL_H */
diff --git a/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
index c658d8cf760b..c658d8cf760b 100755
--- a/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh
+++ b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
diff --git a/arch/powerpc/relocs_check.sh b/arch/powerpc/tools/relocs_check.sh
index ec2d5c835170..ec2d5c835170 100755
--- a/arch/powerpc/relocs_check.sh
+++ b/arch/powerpc/tools/relocs_check.sh
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 16321ad9e70c..f11f65634aab 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -29,7 +29,9 @@
 #include <linux/nmi.h>
 #include <linux/ctype.h>
 
+#include <asm/debugfs.h>
 #include <asm/ptrace.h>
+#include <asm/smp.h>
 #include <asm/string.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
@@ -48,7 +50,7 @@
 #include <asm/reg.h>
 #include <asm/debug.h>
 #include <asm/hw_breakpoint.h>
-
+#include <asm/xive.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
 
@@ -76,6 +78,7 @@ static int xmon_gate;
 #endif /* CONFIG_SMP */
 
 static unsigned long in_xmon __read_mostly = 0;
+static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT);
 
 static unsigned long adrs;
 static int size = 1;
@@ -184,8 +187,6 @@ static void dump_tlb_44x(void);
 static void dump_tlb_book3e(void);
 #endif
 
-static int xmon_no_auto_backtrace;
-
 #ifdef CONFIG_PPC64
 #define REG		"%.16lx"
 #else
@@ -232,7 +233,13 @@ Commands:\n\
   "\
   dr	dump stream of raw bytes\n\
   dt	dump the tracing buffers (uses printk)\n\
-  e	print exception information\n\
+"
+#ifdef CONFIG_PPC_POWERNV
+"  dx#   dump xive on CPU #\n\
+  dxi#  dump xive irq state #\n\
+  dxa   dump xive on all CPUs\n"
+#endif
+"  e	print exception information\n\
   f	flush cache\n\
   la	lookup symbol+offset of specified address\n\
   ls	lookup address of specified symbol\n\
@@ -411,7 +418,22 @@ int cpus_are_in_xmon(void)
 {
 	return !cpumask_empty(&cpus_in_xmon);
 }
-#endif
+
+static bool wait_for_other_cpus(int ncpus)
+{
+	unsigned long timeout;
+
+	/* We wait for 2s, which is a metric "little while" */
+	for (timeout = 20000; timeout != 0; --timeout) {
+		if (cpumask_weight(&cpus_in_xmon) >= ncpus)
+			return true;
+		udelay(100);
+		barrier();
+	}
+
+	return false;
+}
+#endif /* CONFIG_SMP */
 
 static inline int unrecoverable_excp(struct pt_regs *regs)
 {
@@ -433,7 +455,6 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 #ifdef CONFIG_SMP
 	int cpu;
 	int secondary;
-	unsigned long timeout;
 #endif
 
 	local_irq_save(flags);
@@ -520,13 +541,17 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 		xmon_owner = cpu;
 		mb();
 		if (ncpus > 1) {
-			smp_send_debugger_break();
-			/* wait for other cpus to come in */
-			for (timeout = 100000000; timeout != 0; --timeout) {
-				if (cpumask_weight(&cpus_in_xmon) >= ncpus)
-					break;
-				barrier();
-			}
+			/*
+			 * A system reset (trap == 0x100) can be triggered on
+			 * all CPUs, so when we come in via 0x100 try waiting
+			 * for the other CPUs to come in before we send the
+			 * debugger break (IPI). This is similar to
+			 * crash_kexec_secondary().
+			 */
+			if (TRAP(regs) != 0x100 || !wait_for_other_cpus(ncpus))
+				smp_send_debugger_break();
+
+			wait_for_other_cpus(ncpus);
 		}
 		remove_bpts();
 		disable_surveillance();
@@ -884,10 +909,7 @@ cmds(struct pt_regs *excp)
 	last_cmd = NULL;
 	xmon_regs = excp;
 
-	if (!xmon_no_auto_backtrace) {
-		xmon_no_auto_backtrace = 1;
-		xmon_show_stack(excp->gpr[1], excp->link, excp->nip);
-	}
+	xmon_show_stack(excp->gpr[1], excp->link, excp->nip);
 
 	for(;;) {
 #ifdef CONFIG_SMP
@@ -1347,9 +1369,19 @@ const char *getvecname(unsigned long vec)
 	case 0x100:	ret = "(System Reset)"; break;
 	case 0x200:	ret = "(Machine Check)"; break;
 	case 0x300:	ret = "(Data Access)"; break;
-	case 0x380:	ret = "(Data SLB Access)"; break;
+	case 0x380:
+		if (radix_enabled())
+			ret = "(Data Access Out of Range)";
+		else
+			ret = "(Data SLB Access)";
+		break;
 	case 0x400:	ret = "(Instruction Access)"; break;
-	case 0x480:	ret = "(Instruction SLB Access)"; break;
+	case 0x480:
+		if (radix_enabled())
+			ret = "(Instruction Access Out of Range)";
+		else
+			ret = "(Instruction SLB Access)";
+		break;
 	case 0x500:	ret = "(Hardware Interrupt)"; break;
 	case 0x600:	ret = "(Alignment)"; break;
 	case 0x700:	ret = "(Program Check)"; break;
@@ -2231,7 +2263,9 @@ static void dump_one_paca(int cpu)
 	DUMP(p, kernel_msr, "lx");
 	DUMP(p, emergency_sp, "p");
 #ifdef CONFIG_PPC_BOOK3S_64
+	DUMP(p, nmi_emergency_sp, "p");
 	DUMP(p, mc_emergency_sp, "p");
+	DUMP(p, in_nmi, "x");
 	DUMP(p, in_mce, "x");
 	DUMP(p, hmi_event_available, "x");
 #endif
@@ -2338,6 +2372,81 @@ static void dump_pacas(void)
 }
 #endif
 
+#ifdef CONFIG_PPC_POWERNV
+static void dump_one_xive(int cpu)
+{
+	unsigned int hwid = get_hard_smp_processor_id(cpu);
+
+	opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
+	opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
+	opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
+	opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
+	opal_xive_dump(XIVE_DUMP_VP, hwid);
+	opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
+
+	if (setjmp(bus_error_jmp) != 0) {
+		catch_memory_errors = 0;
+		printf("*** Error dumping xive on cpu %d\n", cpu);
+		return;
+	}
+
+	catch_memory_errors = 1;
+	sync();
+	xmon_xive_do_dump(cpu);
+	sync();
+	__delay(200);
+	catch_memory_errors = 0;
+}
+
+static void dump_all_xives(void)
+{
+	int cpu;
+
+	if (num_possible_cpus() == 0) {
+		printf("No possible cpus, use 'dx #' to dump individual cpus\n");
+		return;
+	}
+
+	for_each_possible_cpu(cpu)
+		dump_one_xive(cpu);
+}
+
+static void dump_one_xive_irq(u32 num)
+{
+	s64 rc;
+	__be64 vp;
+	u8 prio;
+	__be32 lirq;
+
+	rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq);
+	xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n",
+		    num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc);
+}
+
+static void dump_xives(void)
+{
+	unsigned long num;
+	int c;
+
+	c = inchar();
+	if (c == 'a') {
+		dump_all_xives();
+		return;
+	} else if (c == 'i') {
+		if (scanhex(&num))
+			dump_one_xive_irq(num);
+		return;
+	}
+
+	termch = c;	/* Put c back, it wasn't 'a' */
+
+	if (scanhex(&num))
+		dump_one_xive(num);
+	else
+		dump_one_xive(xmon_owner);
+}
+#endif /* CONFIG_PPC_POWERNV */
+
 static void dump_by_size(unsigned long addr, long count, int size)
 {
 	unsigned char temp[16];
@@ -2386,6 +2495,14 @@ dump(void)
 		return;
 	}
 #endif
+#ifdef CONFIG_PPC_POWERNV
+	if (c == 'x') {
+		xmon_start_pagination();
+		dump_xives();
+		xmon_end_pagination();
+		return;
+	}
+#endif
 
 	if (c == '\n')
 		termch = c;
@@ -3070,23 +3187,28 @@ void dump_segments(void)
 	for (i = 0; i < mmu_slb_size; i++) {
 		asm volatile("slbmfee  %0,%1" : "=r" (esid) : "r" (i));
 		asm volatile("slbmfev  %0,%1" : "=r" (vsid) : "r" (i));
-		if (esid || vsid) {
-			printf("%02d %016lx %016lx", i, esid, vsid);
-			if (esid & SLB_ESID_V) {
-				llp = vsid & SLB_VSID_LLP;
-				if (vsid & SLB_VSID_B_1T) {
-					printf("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx \n",
-						GET_ESID_1T(esid),
-						(vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
-						llp);
-				} else {
-					printf(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx \n",
-						GET_ESID(esid),
-						(vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
-						llp);
-				}
-			} else
-				printf("\n");
+
+		if (!esid && !vsid)
+			continue;
+
+		printf("%02d %016lx %016lx", i, esid, vsid);
+
+		if (!(esid & SLB_ESID_V)) {
+			printf("\n");
+			continue;
+		}
+
+		llp = vsid & SLB_VSID_LLP;
+		if (vsid & SLB_VSID_B_1T) {
+			printf("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx \n",
+				GET_ESID_1T(esid),
+				(vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
+				llp);
+		} else {
+			printf(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx \n",
+				GET_ESID(esid),
+				(vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
+				llp);
 		}
 	}
 }
@@ -3302,6 +3424,8 @@ static void sysrq_handle_xmon(int key)
 	/* ensure xmon is enabled */
 	xmon_init(1);
 	debugger(get_irq_regs());
+	if (!xmon_on)
+		xmon_init(0);
 }
 
 static struct sysrq_key_op sysrq_xmon_op = {
@@ -3315,10 +3439,37 @@ static int __init setup_xmon_sysrq(void)
 	register_sysrq_key('x', &sysrq_xmon_op);
 	return 0;
 }
-__initcall(setup_xmon_sysrq);
+device_initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-static int __initdata xmon_early, xmon_off;
+#ifdef CONFIG_DEBUG_FS
+static int xmon_dbgfs_set(void *data, u64 val)
+{
+	xmon_on = !!val;
+	xmon_init(xmon_on);
+
+	return 0;
+}
+
+static int xmon_dbgfs_get(void *data, u64 *val)
+{
+	*val = xmon_on;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get,
+			xmon_dbgfs_set, "%llu\n");
+
+static int __init setup_xmon_dbgfs(void)
+{
+	debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL,
+				&xmon_dbgfs_ops);
+	return 0;
+}
+device_initcall(setup_xmon_dbgfs);
+#endif /* CONFIG_DEBUG_FS */
+
+static int xmon_early __initdata;
 
 static int __init early_parse_xmon(char *p)
 {
@@ -3326,12 +3477,12 @@ static int __init early_parse_xmon(char *p)
 		/* just "xmon" is equivalent to "xmon=early" */
 		xmon_init(1);
 		xmon_early = 1;
-	} else if (strncmp(p, "on", 2) == 0)
+		xmon_on = 1;
+	} else if (strncmp(p, "on", 2) == 0) {
 		xmon_init(1);
-	else if (strncmp(p, "off", 3) == 0)
-		xmon_off = 1;
-	else if (strncmp(p, "nobt", 4) == 0)
-		xmon_no_auto_backtrace = 1;
+		xmon_on = 1;
+	} else if (strncmp(p, "off", 3) == 0)
+		xmon_on = 0;
 	else
 		return 1;
 
@@ -3341,10 +3492,8 @@ early_param("xmon", early_parse_xmon);
 
 void __init xmon_setup(void)
 {
-#ifdef CONFIG_XMON_DEFAULT
-	if (!xmon_off)
+	if (xmon_on)
 		xmon_init(1);
-#endif
 	if (xmon_early)
 		debugger(NULL);
 }
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index bcc030eacab7..1a138c83f877 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -14,6 +14,7 @@
 #include <linux/msi.h>
 #include <linux/module.h>
 #include <linux/mount.h>
+#include <linux/sched/mm.h>
 
 #include "cxl.h"
 
@@ -321,19 +322,29 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
 
 	if (task) {
 		ctx->pid = get_task_pid(task, PIDTYPE_PID);
-		ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID);
 		kernel = false;
 		ctx->real_mode = false;
+
+		/* acquire a reference to the task's mm */
+		ctx->mm = get_task_mm(current);
+
+		/* ensure this mm_struct can't be freed */
+		cxl_context_mm_count_get(ctx);
+
+		/* decrement the use count */
+		if (ctx->mm)
+			mmput(ctx->mm);
 	}
 
 	cxl_ctx_get();
 
 	if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) {
-		put_pid(ctx->glpid);
 		put_pid(ctx->pid);
-		ctx->glpid = ctx->pid = NULL;
+		ctx->pid = NULL;
 		cxl_adapter_context_put(ctx->afu->adapter);
 		cxl_ctx_put();
+		if (task)
+			cxl_context_mm_count_put(ctx);
 		goto out;
 	}
 
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 062bf6ca2625..4472ce11f98d 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -17,6 +17,7 @@
 #include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/sched/mm.h>
 #include <asm/cputable.h>
 #include <asm/current.h>
 #include <asm/copro.h>
@@ -38,23 +39,26 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master)
 {
 	int i;
 
-	spin_lock_init(&ctx->sste_lock);
 	ctx->afu = afu;
 	ctx->master = master;
-	ctx->pid = ctx->glpid = NULL; /* Set in start work ioctl */
+	ctx->pid = NULL; /* Set in start work ioctl */
 	mutex_init(&ctx->mapping_lock);
 	ctx->mapping = NULL;
 
-	/*
-	 * Allocate the segment table before we put it in the IDR so that we
-	 * can always access it when dereferenced from IDR. For the same
-	 * reason, the segment table is only destroyed after the context is
-	 * removed from the IDR.  Access to this in the IOCTL is protected by
-	 * Linux filesytem symantics (can't IOCTL until open is complete).
-	 */
-	i = cxl_alloc_sst(ctx);
-	if (i)
-		return i;
+	if (cxl_is_psl8(afu)) {
+		spin_lock_init(&ctx->sste_lock);
+
+		/*
+		 * Allocate the segment table before we put it in the IDR so that we
+		 * can always access it when dereferenced from IDR. For the same
+		 * reason, the segment table is only destroyed after the context is
+		 * removed from the IDR.  Access to this in the IOCTL is protected by
+		 * Linux filesytem symantics (can't IOCTL until open is complete).
+		 */
+		i = cxl_alloc_sst(ctx);
+		if (i)
+			return i;
+	}
 
 	INIT_WORK(&ctx->fault_work, cxl_handle_fault);
 
@@ -184,13 +188,26 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma)
 	if (ctx->afu->current_mode == CXL_MODE_DEDICATED) {
 		if (start + len > ctx->afu->adapter->ps_size)
 			return -EINVAL;
+
+		if (cxl_is_psl9(ctx->afu)) {
+			/*
+			 * Make sure there is a valid problem state
+			 * area space for this AFU.
+			 */
+			if (ctx->master && !ctx->afu->psa) {
+				pr_devel("AFU doesn't support mmio space\n");
+				return -EINVAL;
+			}
+
+			/* Can't mmap until the AFU is enabled */
+			if (!ctx->afu->enabled)
+				return -EBUSY;
+		}
 	} else {
 		if (start + len > ctx->psn_size)
 			return -EINVAL;
-	}
 
-	if (ctx->afu->current_mode != CXL_MODE_DEDICATED) {
-		/* make sure there is a valid per process space for this AFU */
+		/* Make sure there is a valid per process space for this AFU */
 		if ((ctx->master && !ctx->afu->psa) || (!ctx->afu->pp_psa)) {
 			pr_devel("AFU doesn't support mmio space\n");
 			return -EINVAL;
@@ -242,12 +259,16 @@ int __detach_context(struct cxl_context *ctx)
 
 	/* release the reference to the group leader and mm handling pid */
 	put_pid(ctx->pid);
-	put_pid(ctx->glpid);
 
 	cxl_ctx_put();
 
 	/* Decrease the attached context count on the adapter */
 	cxl_adapter_context_put(ctx->afu->adapter);
+
+	/* Decrease the mm count on the context */
+	cxl_context_mm_count_put(ctx);
+	ctx->mm = NULL;
+
 	return 0;
 }
 
@@ -303,7 +324,8 @@ static void reclaim_ctx(struct rcu_head *rcu)
 {
 	struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu);
 
-	free_page((u64)ctx->sstp);
+	if (cxl_is_psl8(ctx->afu))
+		free_page((u64)ctx->sstp);
 	if (ctx->ff_page)
 		__free_page(ctx->ff_page);
 	ctx->sstp = NULL;
@@ -325,3 +347,15 @@ void cxl_context_free(struct cxl_context *ctx)
 	mutex_unlock(&ctx->afu->contexts_lock);
 	call_rcu(&ctx->rcu, reclaim_ctx);
 }
+
+void cxl_context_mm_count_get(struct cxl_context *ctx)
+{
+	if (ctx->mm)
+		atomic_inc(&ctx->mm->mm_count);
+}
+
+void cxl_context_mm_count_put(struct cxl_context *ctx)
+{
+	if (ctx->mm)
+		mmdrop(ctx->mm);
+}
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 79e60ec70bd3..c8568ea7c518 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -63,7 +63,7 @@ typedef struct {
 /* Memory maps. Ref CXL Appendix A */
 
 /* PSL Privilege 1 Memory Map */
-/* Configuration and Control area */
+/* Configuration and Control area - CAIA 1&2 */
 static const cxl_p1_reg_t CXL_PSL_CtxTime = {0x0000};
 static const cxl_p1_reg_t CXL_PSL_ErrIVTE = {0x0008};
 static const cxl_p1_reg_t CXL_PSL_KEY1    = {0x0010};
@@ -73,7 +73,7 @@ static const cxl_p1_reg_t CXL_PSL_Control = {0x0020};
 static const cxl_p1_reg_t CXL_PSL_DLCNTL  = {0x0060};
 static const cxl_p1_reg_t CXL_PSL_DLADDR  = {0x0068};
 
-/* PSL Lookaside Buffer Management Area */
+/* PSL Lookaside Buffer Management Area - CAIA 1 */
 static const cxl_p1_reg_t CXL_PSL_LBISEL  = {0x0080};
 static const cxl_p1_reg_t CXL_PSL_SLBIE   = {0x0088};
 static const cxl_p1_reg_t CXL_PSL_SLBIA   = {0x0090};
@@ -82,7 +82,7 @@ static const cxl_p1_reg_t CXL_PSL_TLBIA   = {0x00A8};
 static const cxl_p1_reg_t CXL_PSL_AFUSEL  = {0x00B0};
 
 /* 0x00C0:7EFF Implementation dependent area */
-/* PSL registers */
+/* PSL registers - CAIA 1 */
 static const cxl_p1_reg_t CXL_PSL_FIR1      = {0x0100};
 static const cxl_p1_reg_t CXL_PSL_FIR2      = {0x0108};
 static const cxl_p1_reg_t CXL_PSL_Timebase  = {0x0110};
@@ -98,61 +98,83 @@ static const cxl_p1_reg_t CXL_XSL_Timebase  = {0x0100};
 static const cxl_p1_reg_t CXL_XSL_TB_CTLSTAT = {0x0108};
 static const cxl_p1_reg_t CXL_XSL_FEC       = {0x0158};
 static const cxl_p1_reg_t CXL_XSL_DSNCTL    = {0x0168};
+/* PSL registers - CAIA 2 */
+static const cxl_p1_reg_t CXL_PSL9_CONTROL  = {0x0020};
+static const cxl_p1_reg_t CXL_XSL9_DSNCTL   = {0x0168};
+static const cxl_p1_reg_t CXL_PSL9_FIR1     = {0x0300};
+static const cxl_p1_reg_t CXL_PSL9_FIR2     = {0x0308};
+static const cxl_p1_reg_t CXL_PSL9_Timebase = {0x0310};
+static const cxl_p1_reg_t CXL_PSL9_DEBUG    = {0x0320};
+static const cxl_p1_reg_t CXL_PSL9_FIR_CNTL = {0x0348};
+static const cxl_p1_reg_t CXL_PSL9_DSNDCTL  = {0x0350};
+static const cxl_p1_reg_t CXL_PSL9_TB_CTLSTAT = {0x0340};
+static const cxl_p1_reg_t CXL_PSL9_TRACECFG = {0x0368};
+static const cxl_p1_reg_t CXL_PSL9_APCDEDALLOC = {0x0378};
+static const cxl_p1_reg_t CXL_PSL9_APCDEDTYPE = {0x0380};
+static const cxl_p1_reg_t CXL_PSL9_TNR_ADDR = {0x0388};
+static const cxl_p1_reg_t CXL_PSL9_GP_CT = {0x0398};
+static const cxl_p1_reg_t CXL_XSL9_IERAT = {0x0588};
+static const cxl_p1_reg_t CXL_XSL9_ILPP  = {0x0590};
+
 /* 0x7F00:7FFF Reserved PCIe MSI-X Pending Bit Array area */
 /* 0x8000:FFFF Reserved PCIe MSI-X Table Area */
 
 /* PSL Slice Privilege 1 Memory Map */
-/* Configuration Area */
+/* Configuration Area - CAIA 1&2 */
 static const cxl_p1n_reg_t CXL_PSL_SR_An          = {0x00};
 static const cxl_p1n_reg_t CXL_PSL_LPID_An        = {0x08};
 static const cxl_p1n_reg_t CXL_PSL_AMBAR_An       = {0x10};
 static const cxl_p1n_reg_t CXL_PSL_SPOffset_An    = {0x18};
 static const cxl_p1n_reg_t CXL_PSL_ID_An          = {0x20};
 static const cxl_p1n_reg_t CXL_PSL_SERR_An        = {0x28};
-/* Memory Management and Lookaside Buffer Management */
+/* Memory Management and Lookaside Buffer Management - CAIA 1*/
 static const cxl_p1n_reg_t CXL_PSL_SDR_An         = {0x30};
+/* Memory Management and Lookaside Buffer Management - CAIA 1&2 */
 static const cxl_p1n_reg_t CXL_PSL_AMOR_An        = {0x38};
-/* Pointer Area */
+/* Pointer Area - CAIA 1&2 */
 static const cxl_p1n_reg_t CXL_HAURP_An           = {0x80};
 static const cxl_p1n_reg_t CXL_PSL_SPAP_An        = {0x88};
 static const cxl_p1n_reg_t CXL_PSL_LLCMD_An       = {0x90};
-/* Control Area */
+/* Control Area - CAIA 1&2 */
 static const cxl_p1n_reg_t CXL_PSL_SCNTL_An       = {0xA0};
 static const cxl_p1n_reg_t CXL_PSL_CtxTime_An     = {0xA8};
 static const cxl_p1n_reg_t CXL_PSL_IVTE_Offset_An = {0xB0};
 static const cxl_p1n_reg_t CXL_PSL_IVTE_Limit_An  = {0xB8};
-/* 0xC0:FF Implementation Dependent Area */
+/* 0xC0:FF Implementation Dependent Area - CAIA 1&2 */
 static const cxl_p1n_reg_t CXL_PSL_FIR_SLICE_An   = {0xC0};
 static const cxl_p1n_reg_t CXL_AFU_DEBUG_An       = {0xC8};
+/* 0xC0:FF Implementation Dependent Area - CAIA 1 */
 static const cxl_p1n_reg_t CXL_PSL_APCALLOC_A     = {0xD0};
 static const cxl_p1n_reg_t CXL_PSL_COALLOC_A      = {0xD8};
 static const cxl_p1n_reg_t CXL_PSL_RXCTL_A        = {0xE0};
 static const cxl_p1n_reg_t CXL_PSL_SLICE_TRACE    = {0xE8};
 
 /* PSL Slice Privilege 2 Memory Map */
-/* Configuration and Control Area */
+/* Configuration and Control Area - CAIA 1&2 */
 static const cxl_p2n_reg_t CXL_PSL_PID_TID_An = {0x000};
 static const cxl_p2n_reg_t CXL_CSRP_An        = {0x008};
+/* Configuration and Control Area - CAIA 1 */
 static const cxl_p2n_reg_t CXL_AURP0_An       = {0x010};
 static const cxl_p2n_reg_t CXL_AURP1_An       = {0x018};
 static const cxl_p2n_reg_t CXL_SSTP0_An       = {0x020};
 static const cxl_p2n_reg_t CXL_SSTP1_An       = {0x028};
+/* Configuration and Control Area - CAIA 1 */
 static const cxl_p2n_reg_t CXL_PSL_AMR_An     = {0x030};
-/* Segment Lookaside Buffer Management */
+/* Segment Lookaside Buffer Management - CAIA 1 */
 static const cxl_p2n_reg_t CXL_SLBIE_An       = {0x040};
 static const cxl_p2n_reg_t CXL_SLBIA_An       = {0x048};
 static const cxl_p2n_reg_t CXL_SLBI_Select_An = {0x050};
-/* Interrupt Registers */
+/* Interrupt Registers - CAIA 1&2 */
 static const cxl_p2n_reg_t CXL_PSL_DSISR_An   = {0x060};
 static const cxl_p2n_reg_t CXL_PSL_DAR_An     = {0x068};
 static const cxl_p2n_reg_t CXL_PSL_DSR_An     = {0x070};
 static const cxl_p2n_reg_t CXL_PSL_TFC_An     = {0x078};
 static const cxl_p2n_reg_t CXL_PSL_PEHandle_An = {0x080};
 static const cxl_p2n_reg_t CXL_PSL_ErrStat_An = {0x088};
-/* AFU Registers */
+/* AFU Registers - CAIA 1&2 */
 static const cxl_p2n_reg_t CXL_AFU_Cntl_An    = {0x090};
 static const cxl_p2n_reg_t CXL_AFU_ERR_An     = {0x098};
-/* Work Element Descriptor */
+/* Work Element Descriptor - CAIA 1&2 */
 static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 /* 0x0C0:FFF Implementation Dependent Area */
 
@@ -179,6 +201,10 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 #define CXL_PSL_SR_An_SF  MSR_SF            /* 64bit */
 #define CXL_PSL_SR_An_TA  (1ull << (63-1))  /* Tags active,   GA1: 0 */
 #define CXL_PSL_SR_An_HV  MSR_HV            /* Hypervisor,    GA1: 0 */
+#define CXL_PSL_SR_An_XLAT_hpt (0ull << (63-6))/* Hashed page table (HPT) mode */
+#define CXL_PSL_SR_An_XLAT_roh (2ull << (63-6))/* Radix on HPT mode */
+#define CXL_PSL_SR_An_XLAT_ror (3ull << (63-6))/* Radix on Radix mode */
+#define CXL_PSL_SR_An_BOT (1ull << (63-10)) /* Use the in-memory segment table */
 #define CXL_PSL_SR_An_PR  MSR_PR            /* Problem state, GA1: 1 */
 #define CXL_PSL_SR_An_ISL (1ull << (63-53)) /* Ignore Segment Large Page */
 #define CXL_PSL_SR_An_TC  (1ull << (63-54)) /* Page Table secondary hash */
@@ -202,6 +228,24 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 #define CXL_PSL_SERR_An_llcmdto	(1ull << (63-6))
 #define CXL_PSL_SERR_An_afupar	(1ull << (63-7))
 #define CXL_PSL_SERR_An_afudup	(1ull << (63-8))
+#define CXL_PSL_SERR_An_IRQS	( \
+	CXL_PSL_SERR_An_afuto | CXL_PSL_SERR_An_afudis | CXL_PSL_SERR_An_afuov | \
+	CXL_PSL_SERR_An_badsrc | CXL_PSL_SERR_An_badctx | CXL_PSL_SERR_An_llcmdis | \
+	CXL_PSL_SERR_An_llcmdto | CXL_PSL_SERR_An_afupar | CXL_PSL_SERR_An_afudup)
+#define CXL_PSL_SERR_An_afuto_mask	(1ull << (63-32))
+#define CXL_PSL_SERR_An_afudis_mask	(1ull << (63-33))
+#define CXL_PSL_SERR_An_afuov_mask	(1ull << (63-34))
+#define CXL_PSL_SERR_An_badsrc_mask	(1ull << (63-35))
+#define CXL_PSL_SERR_An_badctx_mask	(1ull << (63-36))
+#define CXL_PSL_SERR_An_llcmdis_mask	(1ull << (63-37))
+#define CXL_PSL_SERR_An_llcmdto_mask	(1ull << (63-38))
+#define CXL_PSL_SERR_An_afupar_mask	(1ull << (63-39))
+#define CXL_PSL_SERR_An_afudup_mask	(1ull << (63-40))
+#define CXL_PSL_SERR_An_IRQ_MASKS	( \
+	CXL_PSL_SERR_An_afuto_mask | CXL_PSL_SERR_An_afudis_mask | CXL_PSL_SERR_An_afuov_mask | \
+	CXL_PSL_SERR_An_badsrc_mask | CXL_PSL_SERR_An_badctx_mask | CXL_PSL_SERR_An_llcmdis_mask | \
+	CXL_PSL_SERR_An_llcmdto_mask | CXL_PSL_SERR_An_afupar_mask | CXL_PSL_SERR_An_afudup_mask)
+
 #define CXL_PSL_SERR_An_AE	(1ull << (63-30))
 
 /****** CXL_PSL_SCNTL_An ****************************************************/
@@ -257,7 +301,7 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 #define CXL_SSTP1_An_STVA_L_MASK (~((1ull << (63-55))-1))
 #define CXL_SSTP1_An_V              (1ull << (63-63))
 
-/****** CXL_PSL_SLBIE_[An] **************************************************/
+/****** CXL_PSL_SLBIE_[An] - CAIA 1 **************************************************/
 /* write: */
 #define CXL_SLBIE_C        PPC_BIT(36)         /* Class */
 #define CXL_SLBIE_SS       PPC_BITMASK(37, 38) /* Segment Size */
@@ -267,10 +311,10 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 #define CXL_SLBIE_MAX      PPC_BITMASK(24, 31)
 #define CXL_SLBIE_PENDING  PPC_BITMASK(56, 63)
 
-/****** Common to all CXL_TLBIA/SLBIA_[An] **********************************/
+/****** Common to all CXL_TLBIA/SLBIA_[An] - CAIA 1 **********************************/
 #define CXL_TLB_SLB_P          (1ull) /* Pending (read) */
 
-/****** Common to all CXL_TLB/SLB_IA/IE_[An] registers **********************/
+/****** Common to all CXL_TLB/SLB_IA/IE_[An] registers - CAIA 1 **********************/
 #define CXL_TLB_SLB_IQ_ALL     (0ull) /* Inv qualifier */
 #define CXL_TLB_SLB_IQ_LPID    (1ull) /* Inv qualifier */
 #define CXL_TLB_SLB_IQ_LPIDPID (3ull) /* Inv qualifier */
@@ -278,7 +322,7 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 /****** CXL_PSL_AFUSEL ******************************************************/
 #define CXL_PSL_AFUSEL_A (1ull << (63-55)) /* Adapter wide invalidates affect all AFUs */
 
-/****** CXL_PSL_DSISR_An ****************************************************/
+/****** CXL_PSL_DSISR_An - CAIA 1 ****************************************************/
 #define CXL_PSL_DSISR_An_DS (1ull << (63-0))  /* Segment not found */
 #define CXL_PSL_DSISR_An_DM (1ull << (63-1))  /* PTE not found (See also: M) or protection fault */
 #define CXL_PSL_DSISR_An_ST (1ull << (63-2))  /* Segment Table PTE not found */
@@ -295,12 +339,39 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 #define CXL_PSL_DSISR_An_S  DSISR_ISSTORE     /* Access was afu_wr or afu_zero */
 #define CXL_PSL_DSISR_An_K  DSISR_KEYFAULT    /* Access not permitted by virtual page class key protection */
 
+/****** CXL_PSL_DSISR_An - CAIA 2 ****************************************************/
+#define CXL_PSL9_DSISR_An_TF (1ull << (63-3))  /* Translation fault */
+#define CXL_PSL9_DSISR_An_PE (1ull << (63-4))  /* PSL Error (implementation specific) */
+#define CXL_PSL9_DSISR_An_AE (1ull << (63-5))  /* AFU Error */
+#define CXL_PSL9_DSISR_An_OC (1ull << (63-6))  /* OS Context Warning */
+#define CXL_PSL9_DSISR_An_S (1ull << (63-38))  /* TF for a write operation */
+#define CXL_PSL9_DSISR_PENDING (CXL_PSL9_DSISR_An_TF | CXL_PSL9_DSISR_An_PE | CXL_PSL9_DSISR_An_AE | CXL_PSL9_DSISR_An_OC)
+/*
+ * NOTE: Bits 56:63 (Checkout Response Status) are valid when DSISR_An[TF] = 1
+ * Status (0:7) Encoding
+ */
+#define CXL_PSL9_DSISR_An_CO_MASK 0x00000000000000ffULL
+#define CXL_PSL9_DSISR_An_SF      0x0000000000000080ULL  /* Segment Fault                        0b10000000 */
+#define CXL_PSL9_DSISR_An_PF_SLR  0x0000000000000088ULL  /* PTE not found (Single Level Radix)   0b10001000 */
+#define CXL_PSL9_DSISR_An_PF_RGC  0x000000000000008CULL  /* PTE not found (Radix Guest (child))  0b10001100 */
+#define CXL_PSL9_DSISR_An_PF_RGP  0x0000000000000090ULL  /* PTE not found (Radix Guest (parent)) 0b10010000 */
+#define CXL_PSL9_DSISR_An_PF_HRH  0x0000000000000094ULL  /* PTE not found (HPT/Radix Host)       0b10010100 */
+#define CXL_PSL9_DSISR_An_PF_STEG 0x000000000000009CULL  /* PTE not found (STEG VA)              0b10011100 */
+
 /****** CXL_PSL_TFC_An ******************************************************/
 #define CXL_PSL_TFC_An_A  (1ull << (63-28)) /* Acknowledge non-translation fault */
 #define CXL_PSL_TFC_An_C  (1ull << (63-29)) /* Continue (abort transaction) */
 #define CXL_PSL_TFC_An_AE (1ull << (63-30)) /* Restart PSL with address error */
 #define CXL_PSL_TFC_An_R  (1ull << (63-31)) /* Restart PSL transaction */
 
+/****** CXL_XSL9_IERAT_ERAT - CAIA 2 **********************************/
+#define CXL_XSL9_IERAT_MLPID    (1ull << (63-0))  /* Match LPID */
+#define CXL_XSL9_IERAT_MPID     (1ull << (63-1))  /* Match PID */
+#define CXL_XSL9_IERAT_PRS      (1ull << (63-4))  /* PRS bit for Radix invalidations */
+#define CXL_XSL9_IERAT_INVR     (1ull << (63-3))  /* Invalidate Radix */
+#define CXL_XSL9_IERAT_IALL     (1ull << (63-8))  /* Invalidate All */
+#define CXL_XSL9_IERAT_IINPROG  (1ull << (63-63)) /* Invalidate in progress */
+
 /* cxl_process_element->software_status */
 #define CXL_PE_SOFTWARE_STATE_V (1ul << (31 -  0)) /* Valid */
 #define CXL_PE_SOFTWARE_STATE_C (1ul << (31 - 29)) /* Complete */
@@ -482,8 +553,6 @@ struct cxl_context {
 	unsigned int sst_size, sst_lru;
 
 	wait_queue_head_t wq;
-	/* pid of the group leader associated with the pid */
-	struct pid *glpid;
 	/* use mm context associated with this pid for ds faults */
 	struct pid *pid;
 	spinlock_t lock; /* Protects pending_irq_mask, pending_fault and fault_addr */
@@ -551,15 +620,27 @@ struct cxl_context {
 	 * CX4 only:
 	 */
 	struct list_head extra_irq_contexts;
+
+	struct mm_struct *mm;
 };
 
+struct cxl_irq_info;
+
 struct cxl_service_layer_ops {
 	int (*adapter_regs_init)(struct cxl *adapter, struct pci_dev *dev);
+	int (*invalidate_all)(struct cxl *adapter);
 	int (*afu_regs_init)(struct cxl_afu *afu);
+	int (*sanitise_afu_regs)(struct cxl_afu *afu);
 	int (*register_serr_irq)(struct cxl_afu *afu);
 	void (*release_serr_irq)(struct cxl_afu *afu);
-	void (*debugfs_add_adapter_sl_regs)(struct cxl *adapter, struct dentry *dir);
-	void (*debugfs_add_afu_sl_regs)(struct cxl_afu *afu, struct dentry *dir);
+	irqreturn_t (*handle_interrupt)(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+	irqreturn_t (*fail_irq)(struct cxl_afu *afu, struct cxl_irq_info *irq_info);
+	int (*activate_dedicated_process)(struct cxl_afu *afu);
+	int (*attach_afu_directed)(struct cxl_context *ctx, u64 wed, u64 amr);
+	int (*attach_dedicated_process)(struct cxl_context *ctx, u64 wed, u64 amr);
+	void (*update_dedicated_ivtes)(struct cxl_context *ctx);
+	void (*debugfs_add_adapter_regs)(struct cxl *adapter, struct dentry *dir);
+	void (*debugfs_add_afu_regs)(struct cxl_afu *afu, struct dentry *dir);
 	void (*psl_irq_dump_registers)(struct cxl_context *ctx);
 	void (*err_irq_dump_registers)(struct cxl *adapter);
 	void (*debugfs_stop_trace)(struct cxl *adapter);
@@ -641,25 +722,38 @@ int cxl_pci_reset(struct cxl *adapter);
 void cxl_pci_release_afu(struct device *dev);
 ssize_t cxl_pci_read_adapter_vpd(struct cxl *adapter, void *buf, size_t len);
 
-/* common == phyp + powernv */
+/* common == phyp + powernv - CAIA 1&2 */
 struct cxl_process_element_common {
 	__be32 tid;
 	__be32 pid;
 	__be64 csrp;
-	__be64 aurp0;
-	__be64 aurp1;
-	__be64 sstp0;
-	__be64 sstp1;
+	union {
+		struct {
+			__be64 aurp0;
+			__be64 aurp1;
+			__be64 sstp0;
+			__be64 sstp1;
+		} psl8;  /* CAIA 1 */
+		struct {
+			u8     reserved2[8];
+			u8     reserved3[8];
+			u8     reserved4[8];
+			u8     reserved5[8];
+		} psl9;  /* CAIA 2 */
+	} u;
 	__be64 amr;
-	u8     reserved3[4];
+	u8     reserved6[4];
 	__be64 wed;
 } __packed;
 
-/* just powernv */
+/* just powernv - CAIA 1&2 */
 struct cxl_process_element {
 	__be64 sr;
 	__be64 SPOffset;
-	__be64 sdr;
+	union {
+		__be64 sdr;          /* CAIA 1 */
+		u8     reserved1[8]; /* CAIA 2 */
+	} u;
 	__be64 haurp;
 	__be32 ctxtime;
 	__be16 ivte_offsets[4];
@@ -739,6 +833,39 @@ static inline u64 cxl_p2n_read(struct cxl_afu *afu, cxl_p2n_reg_t reg)
 		return ~0ULL;
 }
 
+static inline bool cxl_is_power8(void)
+{
+	if ((pvr_version_is(PVR_POWER8E)) ||
+	    (pvr_version_is(PVR_POWER8NVL)) ||
+	    (pvr_version_is(PVR_POWER8)))
+		return true;
+	return false;
+}
+
+static inline bool cxl_is_power9(void)
+{
+	/* intermediate solution */
+	if (!cxl_is_power8() &&
+	   (cpu_has_feature(CPU_FTRS_POWER9) ||
+	    cpu_has_feature(CPU_FTR_POWER9_DD1)))
+		return true;
+	return false;
+}
+
+static inline bool cxl_is_psl8(struct cxl_afu *afu)
+{
+	if (afu->adapter->caia_major == 1)
+		return true;
+	return false;
+}
+
+static inline bool cxl_is_psl9(struct cxl_afu *afu)
+{
+	if (afu->adapter->caia_major == 2)
+		return true;
+	return false;
+}
+
 ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
 				loff_t off, size_t count);
 
@@ -765,7 +892,6 @@ int cxl_update_properties(struct device_node *dn, struct property *new_prop);
 
 void cxl_remove_adapter_nr(struct cxl *adapter);
 
-int cxl_alloc_spa(struct cxl_afu *afu);
 void cxl_release_spa(struct cxl_afu *afu);
 
 dev_t cxl_get_dev(void);
@@ -803,6 +929,15 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count);
 void afu_release_irqs(struct cxl_context *ctx, void *cookie);
 void afu_irq_name_free(struct cxl_context *ctx);
 
+int cxl_attach_afu_directed_psl9(struct cxl_context *ctx, u64 wed, u64 amr);
+int cxl_attach_afu_directed_psl8(struct cxl_context *ctx, u64 wed, u64 amr);
+int cxl_activate_dedicated_process_psl9(struct cxl_afu *afu);
+int cxl_activate_dedicated_process_psl8(struct cxl_afu *afu);
+int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr);
+int cxl_attach_dedicated_process_psl8(struct cxl_context *ctx, u64 wed, u64 amr);
+void cxl_update_dedicated_ivtes_psl9(struct cxl_context *ctx);
+void cxl_update_dedicated_ivtes_psl8(struct cxl_context *ctx);
+
 #ifdef CONFIG_DEBUG_FS
 
 int cxl_debugfs_init(void);
@@ -811,10 +946,13 @@ int cxl_debugfs_adapter_add(struct cxl *adapter);
 void cxl_debugfs_adapter_remove(struct cxl *adapter);
 int cxl_debugfs_afu_add(struct cxl_afu *afu);
 void cxl_debugfs_afu_remove(struct cxl_afu *afu);
-void cxl_stop_trace(struct cxl *cxl);
-void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir);
-void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir);
-void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir);
+void cxl_stop_trace_psl9(struct cxl *cxl);
+void cxl_stop_trace_psl8(struct cxl *cxl);
+void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir);
+void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir);
 
 #else /* CONFIG_DEBUG_FS */
 
@@ -845,21 +983,34 @@ static inline void cxl_debugfs_afu_remove(struct cxl_afu *afu)
 {
 }
 
-static inline void cxl_stop_trace(struct cxl *cxl)
+static inline void cxl_stop_trace_psl9(struct cxl *cxl)
 {
 }
 
-static inline void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter,
+static inline void cxl_stop_trace_psl8(struct cxl *cxl)
+{
+}
+
+static inline void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter,
 						    struct dentry *dir)
 {
 }
 
-static inline void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter,
+static inline void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter,
 						    struct dentry *dir)
 {
 }
 
-static inline void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir)
+static inline void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter,
+						    struct dentry *dir)
+{
+}
+
+static inline void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir)
+{
+}
+
+static inline void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir)
 {
 }
 
@@ -888,27 +1039,15 @@ int __detach_context(struct cxl_context *ctx);
 /*
  * This must match the layout of the H_COLLECT_CA_INT_INFO retbuf defined
  * in PAPR.
- * A word about endianness: a pointer to this structure is passed when
- * calling the hcall. However, it is not a block of memory filled up by
- * the hypervisor. The return values are found in registers, and copied
- * one by one when returning from the hcall. See the end of the call to
- * plpar_hcall9() in hvCall.S
- * As a consequence:
- * - we don't need to do any endianness conversion
- * - the pid and tid are an exception. They are 32-bit values returned in
- *   the same 64-bit register. So we do need to worry about byte ordering.
+ * Field pid_tid is now 'reserved' because it's no more used on bare-metal.
+ * On a guest environment, PSL_PID_An is located on the upper 32 bits and
+ * PSL_TID_An register in the lower 32 bits.
  */
 struct cxl_irq_info {
 	u64 dsisr;
 	u64 dar;
 	u64 dsr;
-#ifndef CONFIG_CPU_LITTLE_ENDIAN
-	u32 pid;
-	u32 tid;
-#else
-	u32 tid;
-	u32 pid;
-#endif
+	u64 reserved;
 	u64 afu_err;
 	u64 errstat;
 	u64 proc_handle;
@@ -916,19 +1055,23 @@ struct cxl_irq_info {
 };
 
 void cxl_assign_psn_space(struct cxl_context *ctx);
-irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+int cxl_invalidate_all_psl9(struct cxl *adapter);
+int cxl_invalidate_all_psl8(struct cxl *adapter);
+irqreturn_t cxl_irq_psl9(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+irqreturn_t cxl_irq_psl8(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+irqreturn_t cxl_fail_irq_psl(struct cxl_afu *afu, struct cxl_irq_info *irq_info);
 int cxl_register_one_irq(struct cxl *adapter, irq_handler_t handler,
 			void *cookie, irq_hw_number_t *dest_hwirq,
 			unsigned int *dest_virq, const char *name);
 
 int cxl_check_error(struct cxl_afu *afu);
 int cxl_afu_slbia(struct cxl_afu *afu);
-int cxl_tlb_slb_invalidate(struct cxl *adapter);
 int cxl_data_cache_flush(struct cxl *adapter);
 int cxl_afu_disable(struct cxl_afu *afu);
 int cxl_psl_purge(struct cxl_afu *afu);
 
-void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx);
+void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx);
+void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx);
 void cxl_native_err_irq_dump_regs(struct cxl *adapter);
 int cxl_pci_vphb_add(struct cxl_afu *afu);
 void cxl_pci_vphb_remove(struct cxl_afu *afu);
@@ -1024,4 +1167,10 @@ int cxl_adapter_context_lock(struct cxl *adapter);
 /* Unlock the contexts-lock if taken. Warn and force unlock otherwise */
 void cxl_adapter_context_unlock(struct cxl *adapter);
 
+/* Increases the reference count to "struct mm_struct" */
+void cxl_context_mm_count_get(struct cxl_context *ctx);
+
+/* Decrements the reference count to "struct mm_struct" */
+void cxl_context_mm_count_put(struct cxl_context *ctx);
+
 #endif
diff --git a/drivers/misc/cxl/debugfs.c b/drivers/misc/cxl/debugfs.c
index 9c06ac8fa5ac..eae9d749f967 100644
--- a/drivers/misc/cxl/debugfs.c
+++ b/drivers/misc/cxl/debugfs.c
@@ -15,7 +15,13 @@
 
 static struct dentry *cxl_debugfs;
 
-void cxl_stop_trace(struct cxl *adapter)
+void cxl_stop_trace_psl9(struct cxl *adapter)
+{
+	/* Stop the trace */
+	cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x4480000000000000ULL);
+}
+
+void cxl_stop_trace_psl8(struct cxl *adapter)
 {
 	int slice;
 
@@ -53,7 +59,15 @@ static struct dentry *debugfs_create_io_x64(const char *name, umode_t mode,
 					  (void __force *)value, &fops_io_x64);
 }
 
-void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir)
+void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir)
+{
+	debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR1));
+	debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR2));
+	debugfs_create_io_x64("fir_cntl", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR_CNTL));
+	debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_TRACECFG));
+}
+
+void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir)
 {
 	debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR1));
 	debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR2));
@@ -61,7 +75,7 @@ void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir)
 	debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_TRACE));
 }
 
-void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir)
+void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir)
 {
 	debugfs_create_io_x64("fec", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_XSL_FEC));
 }
@@ -82,8 +96,8 @@ int cxl_debugfs_adapter_add(struct cxl *adapter)
 
 	debugfs_create_io_x64("err_ivte", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_ErrIVTE));
 
-	if (adapter->native->sl_ops->debugfs_add_adapter_sl_regs)
-		adapter->native->sl_ops->debugfs_add_adapter_sl_regs(adapter, dir);
+	if (adapter->native->sl_ops->debugfs_add_adapter_regs)
+		adapter->native->sl_ops->debugfs_add_adapter_regs(adapter, dir);
 	return 0;
 }
 
@@ -92,8 +106,16 @@ void cxl_debugfs_adapter_remove(struct cxl *adapter)
 	debugfs_remove_recursive(adapter->debugfs);
 }
 
-void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir)
+void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir)
 {
+	debugfs_create_io_x64("serr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SERR_An));
+}
+
+void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir)
+{
+	debugfs_create_io_x64("sstp0", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP0_An));
+	debugfs_create_io_x64("sstp1", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP1_An));
+
 	debugfs_create_io_x64("fir", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_FIR_SLICE_An));
 	debugfs_create_io_x64("serr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SERR_An));
 	debugfs_create_io_x64("afu_debug", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_AFU_DEBUG_An));
@@ -117,12 +139,11 @@ int cxl_debugfs_afu_add(struct cxl_afu *afu)
 	debugfs_create_io_x64("sr",         S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SR_An));
 	debugfs_create_io_x64("dsisr",      S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DSISR_An));
 	debugfs_create_io_x64("dar",        S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DAR_An));
-	debugfs_create_io_x64("sstp0",      S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP0_An));
-	debugfs_create_io_x64("sstp1",      S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP1_An));
+
 	debugfs_create_io_x64("err_status", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_ErrStat_An));
 
-	if (afu->adapter->native->sl_ops->debugfs_add_afu_sl_regs)
-		afu->adapter->native->sl_ops->debugfs_add_afu_sl_regs(afu, dir);
+	if (afu->adapter->native->sl_ops->debugfs_add_afu_regs)
+		afu->adapter->native->sl_ops->debugfs_add_afu_regs(afu, dir);
 
 	return 0;
 }
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 2fa015c05561..5344448f514e 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -146,108 +146,67 @@ static void cxl_handle_page_fault(struct cxl_context *ctx,
 		return cxl_ack_ae(ctx);
 	}
 
-	/*
-	 * update_mmu_cache() will not have loaded the hash since current->trap
-	 * is not a 0x400 or 0x300, so just call hash_page_mm() here.
-	 */
-	access = _PAGE_PRESENT | _PAGE_READ;
-	if (dsisr & CXL_PSL_DSISR_An_S)
-		access |= _PAGE_WRITE;
-
-	access |= _PAGE_PRIVILEGED;
-	if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID))
-		access &= ~_PAGE_PRIVILEGED;
-
-	if (dsisr & DSISR_NOHPTE)
-		inv_flags |= HPTE_NOHPTE_UPDATE;
-
-	local_irq_save(flags);
-	hash_page_mm(mm, dar, access, 0x300, inv_flags);
-	local_irq_restore(flags);
-
+	if (!radix_enabled()) {
+		/*
+		 * update_mmu_cache() will not have loaded the hash since current->trap
+		 * is not a 0x400 or 0x300, so just call hash_page_mm() here.
+		 */
+		access = _PAGE_PRESENT | _PAGE_READ;
+		if (dsisr & CXL_PSL_DSISR_An_S)
+			access |= _PAGE_WRITE;
+
+		access |= _PAGE_PRIVILEGED;
+		if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID))
+			access &= ~_PAGE_PRIVILEGED;
+
+		if (dsisr & DSISR_NOHPTE)
+			inv_flags |= HPTE_NOHPTE_UPDATE;
+
+		local_irq_save(flags);
+		hash_page_mm(mm, dar, access, 0x300, inv_flags);
+		local_irq_restore(flags);
+	}
 	pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe);
 	cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0);
 }
 
 /*
- * Returns the mm_struct corresponding to the context ctx via ctx->pid
- * In case the task has exited we use the task group leader accessible
- * via ctx->glpid to find the next task in the thread group that has a
- * valid  mm_struct associated with it. If a task with valid mm_struct
- * is found the ctx->pid is updated to use the task struct for subsequent
- * translations. In case no valid mm_struct is found in the task group to
- * service the fault a NULL is returned.
+ * Returns the mm_struct corresponding to the context ctx.
+ * mm_users == 0, the context may be in the process of being closed.
  */
 static struct mm_struct *get_mem_context(struct cxl_context *ctx)
 {
-	struct task_struct *task = NULL;
-	struct mm_struct *mm = NULL;
-	struct pid *old_pid = ctx->pid;
-
-	if (old_pid == NULL) {
-		pr_warn("%s: Invalid context for pe=%d\n",
-			 __func__, ctx->pe);
+	if (ctx->mm == NULL)
 		return NULL;
-	}
 
-	task = get_pid_task(old_pid, PIDTYPE_PID);
+	if (!atomic_inc_not_zero(&ctx->mm->mm_users))
+		return NULL;
 
-	/*
-	 * pid_alive may look racy but this saves us from costly
-	 * get_task_mm when the task is a zombie. In worst case
-	 * we may think a task is alive, which is about to die
-	 * but get_task_mm will return NULL.
-	 */
-	if (task != NULL && pid_alive(task))
-		mm = get_task_mm(task);
+	return ctx->mm;
+}
 
-	/* release the task struct that was taken earlier */
-	if (task)
-		put_task_struct(task);
-	else
-		pr_devel("%s: Context owning pid=%i for pe=%i dead\n",
-			__func__, pid_nr(old_pid), ctx->pe);
-
-	/*
-	 * If we couldn't find the mm context then use the group
-	 * leader to iterate over the task group and find a task
-	 * that gives us mm_struct.
-	 */
-	if (unlikely(mm == NULL && ctx->glpid != NULL)) {
-
-		rcu_read_lock();
-		task = pid_task(ctx->glpid, PIDTYPE_PID);
-		if (task)
-			do {
-				mm = get_task_mm(task);
-				if (mm) {
-					ctx->pid = get_task_pid(task,
-								PIDTYPE_PID);
-					break;
-				}
-				task = next_thread(task);
-			} while (task && !thread_group_leader(task));
-		rcu_read_unlock();
-
-		/* check if we switched pid */
-		if (ctx->pid != old_pid) {
-			if (mm)
-				pr_devel("%s:pe=%i switch pid %i->%i\n",
-					 __func__, ctx->pe, pid_nr(old_pid),
-					 pid_nr(ctx->pid));
-			else
-				pr_devel("%s:Cannot find mm for pid=%i\n",
-					 __func__, pid_nr(old_pid));
-
-			/* drop the reference to older pid */
-			put_pid(old_pid);
-		}
-	}
+static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr)
+{
+	if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DS))
+		return true;
 
-	return mm;
+	return false;
 }
 
+static bool cxl_is_page_fault(struct cxl_context *ctx, u64 dsisr)
+{
+	if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DM))
+		return true;
+
+	if ((cxl_is_psl9(ctx->afu)) &&
+	   ((dsisr & CXL_PSL9_DSISR_An_CO_MASK) &
+		(CXL_PSL9_DSISR_An_PF_SLR | CXL_PSL9_DSISR_An_PF_RGC |
+		 CXL_PSL9_DSISR_An_PF_RGP | CXL_PSL9_DSISR_An_PF_HRH |
+		 CXL_PSL9_DSISR_An_PF_STEG)))
+		return true;
 
+	return false;
+}
 
 void cxl_handle_fault(struct work_struct *fault_work)
 {
@@ -282,7 +241,6 @@ void cxl_handle_fault(struct work_struct *fault_work)
 	if (!ctx->kernel) {
 
 		mm = get_mem_context(ctx);
-		/* indicates all the thread in task group have exited */
 		if (mm == NULL) {
 			pr_devel("%s: unable to get mm for pe=%d pid=%i\n",
 				 __func__, ctx->pe, pid_nr(ctx->pid));
@@ -294,9 +252,9 @@ void cxl_handle_fault(struct work_struct *fault_work)
 		}
 	}
 
-	if (dsisr & CXL_PSL_DSISR_An_DS)
+	if (cxl_is_segment_miss(ctx, dsisr))
 		cxl_handle_segment_miss(ctx, mm, dar);
-	else if (dsisr & CXL_PSL_DSISR_An_DM)
+	else if (cxl_is_page_fault(ctx, dsisr))
 		cxl_handle_page_fault(ctx, mm, dsisr, dar);
 	else
 		WARN(1, "cxl_handle_fault has nothing to handle\n");
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index e7139c76f961..17b433f1ce23 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -18,6 +18,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/sched/mm.h>
 #include <asm/cputable.h>
 #include <asm/current.h>
 #include <asm/copro.h>
@@ -216,8 +217,16 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 	 * process is still accessible.
 	 */
 	ctx->pid = get_task_pid(current, PIDTYPE_PID);
-	ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID);
 
+	/* acquire a reference to the task's mm */
+	ctx->mm = get_task_mm(current);
+
+	/* ensure this mm_struct can't be freed */
+	cxl_context_mm_count_get(ctx);
+
+	/* decrement the use count */
+	if (ctx->mm)
+		mmput(ctx->mm);
 
 	trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);
 
@@ -225,9 +234,9 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 							amr))) {
 		afu_release_irqs(ctx, ctx);
 		cxl_adapter_context_put(ctx->afu->adapter);
-		put_pid(ctx->glpid);
 		put_pid(ctx->pid);
-		ctx->glpid = ctx->pid = NULL;
+		ctx->pid = NULL;
+		cxl_context_mm_count_put(ctx);
 		goto out;
 	}
 
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
index e04bc4ddfd74..f58b4b6c79f2 100644
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -169,7 +169,7 @@ static irqreturn_t guest_psl_irq(int irq, void *data)
 		return IRQ_HANDLED;
 	}
 
-	rc = cxl_irq(irq, ctx, &irq_info);
+	rc = cxl_irq_psl8(irq, ctx, &irq_info);
 	return rc;
 }
 
@@ -551,13 +551,13 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
 	elem->common.tid    = cpu_to_be32(0); /* Unused */
 	elem->common.pid    = cpu_to_be32(pid);
 	elem->common.csrp   = cpu_to_be64(0); /* disable */
-	elem->common.aurp0  = cpu_to_be64(0); /* disable */
-	elem->common.aurp1  = cpu_to_be64(0); /* disable */
+	elem->common.u.psl8.aurp0  = cpu_to_be64(0); /* disable */
+	elem->common.u.psl8.aurp1  = cpu_to_be64(0); /* disable */
 
 	cxl_prefault(ctx, wed);
 
-	elem->common.sstp0  = cpu_to_be64(ctx->sstp0);
-	elem->common.sstp1  = cpu_to_be64(ctx->sstp1);
+	elem->common.u.psl8.sstp0  = cpu_to_be64(ctx->sstp0);
+	elem->common.u.psl8.sstp1  = cpu_to_be64(ctx->sstp1);
 
 	/*
 	 * Ensure we have at least one interrupt allocated to take faults for
diff --git a/drivers/misc/cxl/hcalls.c b/drivers/misc/cxl/hcalls.c
index d6d11f4056d7..9b8bb0f80c3b 100644
--- a/drivers/misc/cxl/hcalls.c
+++ b/drivers/misc/cxl/hcalls.c
@@ -413,9 +413,9 @@ long cxl_h_collect_int_info(u64 unit_address, u64 process_token,
 
 	switch (rc) {
 	case H_SUCCESS:     /* The interrupt info is returned in return registers. */
-		pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid:%u, tid:%u, afu_err:%#llx, errstat:%#llx\n",
-			info->dsisr, info->dar, info->dsr, info->pid,
-			info->tid, info->afu_err, info->errstat);
+		pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid_tid:%#llx, afu_err:%#llx, errstat:%#llx\n",
+			info->dsisr, info->dar, info->dsr, info->reserved,
+			info->afu_err, info->errstat);
 		return 0;
 	case H_PARAMETER:   /* An incorrect parameter was supplied. */
 		return -EINVAL;
diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c
index 1a402bbed687..ce08a9f22308 100644
--- a/drivers/misc/cxl/irq.c
+++ b/drivers/misc/cxl/irq.c
@@ -34,7 +34,58 @@ static irqreturn_t schedule_cxl_fault(struct cxl_context *ctx, u64 dsisr, u64 da
 	return IRQ_HANDLED;
 }
 
-irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
+irqreturn_t cxl_irq_psl9(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
+{
+	u64 dsisr, dar;
+
+	dsisr = irq_info->dsisr;
+	dar = irq_info->dar;
+
+	trace_cxl_psl9_irq(ctx, irq, dsisr, dar);
+
+	pr_devel("CXL interrupt %i for afu pe: %i DSISR: %#llx DAR: %#llx\n", irq, ctx->pe, dsisr, dar);
+
+	if (dsisr & CXL_PSL9_DSISR_An_TF) {
+		pr_devel("CXL interrupt: Scheduling translation fault handling for later (pe: %i)\n", ctx->pe);
+		return schedule_cxl_fault(ctx, dsisr, dar);
+	}
+
+	if (dsisr & CXL_PSL9_DSISR_An_PE)
+		return cxl_ops->handle_psl_slice_error(ctx, dsisr,
+						irq_info->errstat);
+	if (dsisr & CXL_PSL9_DSISR_An_AE) {
+		pr_devel("CXL interrupt: AFU Error 0x%016llx\n", irq_info->afu_err);
+
+		if (ctx->pending_afu_err) {
+			/*
+			 * This shouldn't happen - the PSL treats these errors
+			 * as fatal and will have reset the AFU, so there's not
+			 * much point buffering multiple AFU errors.
+			 * OTOH if we DO ever see a storm of these come in it's
+			 * probably best that we log them somewhere:
+			 */
+			dev_err_ratelimited(&ctx->afu->dev, "CXL AFU Error undelivered to pe %i: 0x%016llx\n",
+					    ctx->pe, irq_info->afu_err);
+		} else {
+			spin_lock(&ctx->lock);
+			ctx->afu_err = irq_info->afu_err;
+			ctx->pending_afu_err = 1;
+			spin_unlock(&ctx->lock);
+
+			wake_up_all(&ctx->wq);
+		}
+
+		cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_A, 0);
+		return IRQ_HANDLED;
+	}
+	if (dsisr & CXL_PSL9_DSISR_An_OC)
+		pr_devel("CXL interrupt: OS Context Warning\n");
+
+	WARN(1, "Unhandled CXL PSL IRQ\n");
+	return IRQ_HANDLED;
+}
+
+irqreturn_t cxl_irq_psl8(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
 {
 	u64 dsisr, dar;
 
diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
index b0b6ed31918e..1703655072b1 100644
--- a/drivers/misc/cxl/main.c
+++ b/drivers/misc/cxl/main.c
@@ -59,16 +59,10 @@ int cxl_afu_slbia(struct cxl_afu *afu)
 
 static inline void _cxl_slbia(struct cxl_context *ctx, struct mm_struct *mm)
 {
-	struct task_struct *task;
 	unsigned long flags;
-	if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) {
-		pr_devel("%s unable to get task %i\n",
-			 __func__, pid_nr(ctx->pid));
-		return;
-	}
 
-	if (task->mm != mm)
-		goto out_put;
+	if (ctx->mm != mm)
+		return;
 
 	pr_devel("%s matched mm - card: %i afu: %i pe: %i\n", __func__,
 		 ctx->afu->adapter->adapter_num, ctx->afu->slice, ctx->pe);
@@ -79,8 +73,6 @@ static inline void _cxl_slbia(struct cxl_context *ctx, struct mm_struct *mm)
 	spin_unlock_irqrestore(&ctx->sste_lock, flags);
 	mb();
 	cxl_afu_slbia(ctx->afu);
-out_put:
-	put_task_struct(task);
 }
 
 static inline void cxl_slbia_core(struct mm_struct *mm)
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 7ae710585267..871a2f09c718 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -95,12 +95,23 @@ int cxl_afu_disable(struct cxl_afu *afu)
 /* This will disable as well as reset */
 static int native_afu_reset(struct cxl_afu *afu)
 {
+	int rc;
+	u64 serr;
+
 	pr_devel("AFU reset request\n");
 
-	return afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
+	rc = afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
 			   CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled,
 			   CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK,
 			   false);
+
+	/* Re-enable any masked interrupts */
+	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
+	serr &= ~CXL_PSL_SERR_An_IRQ_MASKS;
+	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
+
+
+	return rc;
 }
 
 static int native_afu_check_and_enable(struct cxl_afu *afu)
@@ -120,6 +131,7 @@ int cxl_psl_purge(struct cxl_afu *afu)
 	u64 AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
 	u64 dsisr, dar;
 	u64 start, end;
+	u64 trans_fault = 0x0ULL;
 	unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
 	int rc = 0;
 
@@ -127,6 +139,11 @@ int cxl_psl_purge(struct cxl_afu *afu)
 
 	pr_devel("PSL purge request\n");
 
+	if (cxl_is_psl8(afu))
+		trans_fault = CXL_PSL_DSISR_TRANS;
+	if (cxl_is_psl9(afu))
+		trans_fault = CXL_PSL9_DSISR_An_TF;
+
 	if (!cxl_ops->link_ok(afu->adapter, afu)) {
 		dev_warn(&afu->dev, "PSL Purge called with link down, ignoring\n");
 		rc = -EIO;
@@ -155,13 +172,17 @@ int cxl_psl_purge(struct cxl_afu *afu)
 		}
 
 		dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
-		pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%016llx  PSL_DSISR: 0x%016llx\n", PSL_CNTL, dsisr);
-		if (dsisr & CXL_PSL_DSISR_TRANS) {
+		pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%016llx  PSL_DSISR: 0x%016llx\n",
+				     PSL_CNTL, dsisr);
+
+		if (dsisr & trans_fault) {
 			dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
-			dev_notice(&afu->dev, "PSL purge terminating pending translation, DSISR: 0x%016llx, DAR: 0x%016llx\n", dsisr, dar);
+			dev_notice(&afu->dev, "PSL purge terminating pending translation, DSISR: 0x%016llx, DAR: 0x%016llx\n",
+				   dsisr, dar);
 			cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
 		} else if (dsisr) {
-			dev_notice(&afu->dev, "PSL purge acknowledging pending non-translation fault, DSISR: 0x%016llx\n", dsisr);
+			dev_notice(&afu->dev, "PSL purge acknowledging pending non-translation fault, DSISR: 0x%016llx\n",
+				   dsisr);
 			cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
 		} else {
 			cpu_relax();
@@ -196,7 +217,7 @@ static int spa_max_procs(int spa_size)
 	return ((spa_size / 8) - 96) / 17;
 }
 
-int cxl_alloc_spa(struct cxl_afu *afu)
+static int cxl_alloc_spa(struct cxl_afu *afu, int mode)
 {
 	unsigned spa_size;
 
@@ -209,7 +230,8 @@ int cxl_alloc_spa(struct cxl_afu *afu)
 		if (spa_size > 0x100000) {
 			dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n",
 					afu->native->spa_max_procs, afu->native->spa_size);
-			afu->num_procs = afu->native->spa_max_procs;
+			if (mode != CXL_MODE_DEDICATED)
+				afu->num_procs = afu->native->spa_max_procs;
 			break;
 		}
 
@@ -258,7 +280,37 @@ void cxl_release_spa(struct cxl_afu *afu)
 	}
 }
 
-int cxl_tlb_slb_invalidate(struct cxl *adapter)
+/*
+ * Invalidation of all ERAT entries is no longer required by CAIA2. Use
+ * only for debug.
+ */
+int cxl_invalidate_all_psl9(struct cxl *adapter)
+{
+	unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
+	u64 ierat;
+
+	pr_devel("CXL adapter - invalidation of all ERAT entries\n");
+
+	/* Invalidates all ERAT entries for Radix or HPT */
+	ierat = CXL_XSL9_IERAT_IALL;
+	if (radix_enabled())
+		ierat |= CXL_XSL9_IERAT_INVR;
+	cxl_p1_write(adapter, CXL_XSL9_IERAT, ierat);
+
+	while (cxl_p1_read(adapter, CXL_XSL9_IERAT) & CXL_XSL9_IERAT_IINPROG) {
+		if (time_after_eq(jiffies, timeout)) {
+			dev_warn(&adapter->dev,
+			"WARNING: CXL adapter invalidation of all ERAT entries timed out!\n");
+			return -EBUSY;
+		}
+		if (!cxl_ops->link_ok(adapter, NULL))
+			return -EIO;
+		cpu_relax();
+	}
+	return 0;
+}
+
+int cxl_invalidate_all_psl8(struct cxl *adapter)
 {
 	unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
 
@@ -466,7 +518,8 @@ static int remove_process_element(struct cxl_context *ctx)
 
 	if (!rc)
 		ctx->pe_inserted = false;
-	slb_invalid(ctx);
+	if (cxl_is_power8())
+		slb_invalid(ctx);
 	pr_devel("%s Remove pe: %i finished\n", __func__, ctx->pe);
 	mutex_unlock(&ctx->afu->native->spa_mutex);
 
@@ -493,13 +546,14 @@ static int activate_afu_directed(struct cxl_afu *afu)
 
 	afu->num_procs = afu->max_procs_virtualised;
 	if (afu->native->spa == NULL) {
-		if (cxl_alloc_spa(afu))
+		if (cxl_alloc_spa(afu, CXL_MODE_DIRECTED))
 			return -ENOMEM;
 	}
 	attach_spa(afu);
 
 	cxl_p1n_write(afu, CXL_PSL_SCNTL_An, CXL_PSL_SCNTL_An_PM_AFU);
-	cxl_p1n_write(afu, CXL_PSL_AMOR_An, 0xFFFFFFFFFFFFFFFFULL);
+	if (cxl_is_power8())
+		cxl_p1n_write(afu, CXL_PSL_AMOR_An, 0xFFFFFFFFFFFFFFFFULL);
 	cxl_p1n_write(afu, CXL_PSL_ID_An, CXL_PSL_ID_An_F | CXL_PSL_ID_An_L);
 
 	afu->current_mode = CXL_MODE_DIRECTED;
@@ -542,10 +596,19 @@ static u64 calculate_sr(struct cxl_context *ctx)
 		sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV;
 	} else {
 		sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R;
-		sr &= ~(CXL_PSL_SR_An_HV);
+		if (radix_enabled())
+			sr |= CXL_PSL_SR_An_HV;
+		else
+			sr &= ~(CXL_PSL_SR_An_HV);
 		if (!test_tsk_thread_flag(current, TIF_32BIT))
 			sr |= CXL_PSL_SR_An_SF;
 	}
+	if (cxl_is_psl9(ctx->afu)) {
+		if (radix_enabled())
+			sr |= CXL_PSL_SR_An_XLAT_ror;
+		else
+			sr |= CXL_PSL_SR_An_XLAT_hpt;
+	}
 	return sr;
 }
 
@@ -578,7 +641,71 @@ static void update_ivtes_directed(struct cxl_context *ctx)
 		WARN_ON(add_process_element(ctx));
 }
 
-static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
+static int process_element_entry_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
+{
+	u32 pid;
+
+	cxl_assign_psn_space(ctx);
+
+	ctx->elem->ctxtime = 0; /* disable */
+	ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID));
+	ctx->elem->haurp = 0; /* disable */
+
+	if (ctx->kernel)
+		pid = 0;
+	else {
+		if (ctx->mm == NULL) {
+			pr_devel("%s: unable to get mm for pe=%d pid=%i\n",
+				__func__, ctx->pe, pid_nr(ctx->pid));
+			return -EINVAL;
+		}
+		pid = ctx->mm->context.id;
+	}
+
+	ctx->elem->common.tid = 0;
+	ctx->elem->common.pid = cpu_to_be32(pid);
+
+	ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));
+
+	ctx->elem->common.csrp = 0; /* disable */
+
+	cxl_prefault(ctx, wed);
+
+	/*
+	 * Ensure we have the multiplexed PSL interrupt set up to take faults
+	 * for kernel contexts that may not have allocated any AFU IRQs at all:
+	 */
+	if (ctx->irqs.range[0] == 0) {
+		ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq;
+		ctx->irqs.range[0] = 1;
+	}
+
+	ctx->elem->common.amr = cpu_to_be64(amr);
+	ctx->elem->common.wed = cpu_to_be64(wed);
+
+	return 0;
+}
+
+int cxl_attach_afu_directed_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
+{
+	int result;
+
+	/* fill the process element entry */
+	result = process_element_entry_psl9(ctx, wed, amr);
+	if (result)
+		return result;
+
+	update_ivtes_directed(ctx);
+
+	/* first guy needs to enable */
+	result = cxl_ops->afu_check_and_enable(ctx->afu);
+	if (result)
+		return result;
+
+	return add_process_element(ctx);
+}
+
+int cxl_attach_afu_directed_psl8(struct cxl_context *ctx, u64 wed, u64 amr)
 {
 	u32 pid;
 	int result;
@@ -588,7 +715,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
 	ctx->elem->ctxtime = 0; /* disable */
 	ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID));
 	ctx->elem->haurp = 0; /* disable */
-	ctx->elem->sdr = cpu_to_be64(mfspr(SPRN_SDR1));
+	ctx->elem->u.sdr = cpu_to_be64(mfspr(SPRN_SDR1));
 
 	pid = current->pid;
 	if (ctx->kernel)
@@ -599,13 +726,13 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
 	ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));
 
 	ctx->elem->common.csrp = 0; /* disable */
-	ctx->elem->common.aurp0 = 0; /* disable */
-	ctx->elem->common.aurp1 = 0; /* disable */
+	ctx->elem->common.u.psl8.aurp0 = 0; /* disable */
+	ctx->elem->common.u.psl8.aurp1 = 0; /* disable */
 
 	cxl_prefault(ctx, wed);
 
-	ctx->elem->common.sstp0 = cpu_to_be64(ctx->sstp0);
-	ctx->elem->common.sstp1 = cpu_to_be64(ctx->sstp1);
+	ctx->elem->common.u.psl8.sstp0 = cpu_to_be64(ctx->sstp0);
+	ctx->elem->common.u.psl8.sstp1 = cpu_to_be64(ctx->sstp1);
 
 	/*
 	 * Ensure we have the multiplexed PSL interrupt set up to take faults
@@ -671,7 +798,33 @@ static int deactivate_afu_directed(struct cxl_afu *afu)
 	return 0;
 }
 
-static int activate_dedicated_process(struct cxl_afu *afu)
+int cxl_activate_dedicated_process_psl9(struct cxl_afu *afu)
+{
+	dev_info(&afu->dev, "Activating dedicated process mode\n");
+
+	/*
+	 * If XSL is set to dedicated mode (Set in PSL_SCNTL reg), the
+	 * XSL and AFU are programmed to work with a single context.
+	 * The context information should be configured in the SPA area
+	 * index 0 (so PSL_SPAP must be configured before enabling the
+	 * AFU).
+	 */
+	afu->num_procs = 1;
+	if (afu->native->spa == NULL) {
+		if (cxl_alloc_spa(afu, CXL_MODE_DEDICATED))
+			return -ENOMEM;
+	}
+	attach_spa(afu);
+
+	cxl_p1n_write(afu, CXL_PSL_SCNTL_An, CXL_PSL_SCNTL_An_PM_Process);
+	cxl_p1n_write(afu, CXL_PSL_ID_An, CXL_PSL_ID_An_F | CXL_PSL_ID_An_L);
+
+	afu->current_mode = CXL_MODE_DEDICATED;
+
+	return cxl_chardev_d_afu_add(afu);
+}
+
+int cxl_activate_dedicated_process_psl8(struct cxl_afu *afu)
 {
 	dev_info(&afu->dev, "Activating dedicated process mode\n");
 
@@ -694,7 +847,17 @@ static int activate_dedicated_process(struct cxl_afu *afu)
 	return cxl_chardev_d_afu_add(afu);
 }
 
-static void update_ivtes_dedicated(struct cxl_context *ctx)
+void cxl_update_dedicated_ivtes_psl9(struct cxl_context *ctx)
+{
+	int r;
+
+	for (r = 0; r < CXL_IRQ_RANGES; r++) {
+		ctx->elem->ivte_offsets[r] = cpu_to_be16(ctx->irqs.offset[r]);
+		ctx->elem->ivte_ranges[r] = cpu_to_be16(ctx->irqs.range[r]);
+	}
+}
+
+void cxl_update_dedicated_ivtes_psl8(struct cxl_context *ctx)
 {
 	struct cxl_afu *afu = ctx->afu;
 
@@ -710,7 +873,27 @@ static void update_ivtes_dedicated(struct cxl_context *ctx)
 			((u64)ctx->irqs.range[3] & 0xffff));
 }
 
-static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
+int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
+{
+	struct cxl_afu *afu = ctx->afu;
+	int result;
+
+	/* fill the process element entry */
+	result = process_element_entry_psl9(ctx, wed, amr);
+	if (result)
+		return result;
+
+	if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes)
+		afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
+
+	result = cxl_ops->afu_reset(afu);
+	if (result)
+		return result;
+
+	return afu_enable(afu);
+}
+
+int cxl_attach_dedicated_process_psl8(struct cxl_context *ctx, u64 wed, u64 amr)
 {
 	struct cxl_afu *afu = ctx->afu;
 	u64 pid;
@@ -728,7 +911,8 @@ static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
 
 	cxl_prefault(ctx, wed);
 
-	update_ivtes_dedicated(ctx);
+	if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes)
+		afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
 
 	cxl_p2n_write(afu, CXL_PSL_AMR_An, amr);
 
@@ -778,8 +962,9 @@ static int native_afu_activate_mode(struct cxl_afu *afu, int mode)
 
 	if (mode == CXL_MODE_DIRECTED)
 		return activate_afu_directed(afu);
-	if (mode == CXL_MODE_DEDICATED)
-		return activate_dedicated_process(afu);
+	if ((mode == CXL_MODE_DEDICATED) &&
+	    (afu->adapter->native->sl_ops->activate_dedicated_process))
+		return afu->adapter->native->sl_ops->activate_dedicated_process(afu);
 
 	return -EINVAL;
 }
@@ -793,11 +978,13 @@ static int native_attach_process(struct cxl_context *ctx, bool kernel,
 	}
 
 	ctx->kernel = kernel;
-	if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
-		return attach_afu_directed(ctx, wed, amr);
+	if ((ctx->afu->current_mode == CXL_MODE_DIRECTED) &&
+	    (ctx->afu->adapter->native->sl_ops->attach_afu_directed))
+		return ctx->afu->adapter->native->sl_ops->attach_afu_directed(ctx, wed, amr);
 
-	if (ctx->afu->current_mode == CXL_MODE_DEDICATED)
-		return attach_dedicated(ctx, wed, amr);
+	if ((ctx->afu->current_mode == CXL_MODE_DEDICATED) &&
+	    (ctx->afu->adapter->native->sl_ops->attach_dedicated_process))
+		return ctx->afu->adapter->native->sl_ops->attach_dedicated_process(ctx, wed, amr);
 
 	return -EINVAL;
 }
@@ -830,8 +1017,9 @@ static void native_update_ivtes(struct cxl_context *ctx)
 {
 	if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
 		return update_ivtes_directed(ctx);
-	if (ctx->afu->current_mode == CXL_MODE_DEDICATED)
-		return update_ivtes_dedicated(ctx);
+	if ((ctx->afu->current_mode == CXL_MODE_DEDICATED) &&
+	    (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes))
+		return ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
 	WARN(1, "native_update_ivtes: Bad mode\n");
 }
 
@@ -859,8 +1047,6 @@ static int native_detach_process(struct cxl_context *ctx)
 
 static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
 {
-	u64 pidtid;
-
 	/* If the adapter has gone away, we can't get any meaningful
 	 * information.
 	 */
@@ -869,10 +1055,8 @@ static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
 
 	info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
 	info->dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
-	info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);
-	pidtid = cxl_p2n_read(afu, CXL_PSL_PID_TID_An);
-	info->pid = pidtid >> 32;
-	info->tid = pidtid & 0xffffffff;
+	if (cxl_is_power8())
+		info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);
 	info->afu_err = cxl_p2n_read(afu, CXL_AFU_ERR_An);
 	info->errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
 	info->proc_handle = 0;
@@ -880,7 +1064,22 @@ static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
 	return 0;
 }
 
-void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx)
+void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx)
+{
+	u64 fir1, fir2, serr;
+
+	fir1 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR1);
+	fir2 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR2);
+
+	dev_crit(&ctx->afu->dev, "PSL_FIR1: 0x%016llx\n", fir1);
+	dev_crit(&ctx->afu->dev, "PSL_FIR2: 0x%016llx\n", fir2);
+	if (ctx->afu->adapter->native->sl_ops->register_serr_irq) {
+		serr = cxl_p1n_read(ctx->afu, CXL_PSL_SERR_An);
+		cxl_afu_decode_psl_serr(ctx->afu, serr);
+	}
+}
+
+void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx)
 {
 	u64 fir1, fir2, fir_slice, serr, afu_debug;
 
@@ -916,9 +1115,20 @@ static irqreturn_t native_handle_psl_slice_error(struct cxl_context *ctx,
 	return cxl_ops->ack_irq(ctx, 0, errstat);
 }
 
-static irqreturn_t fail_psl_irq(struct cxl_afu *afu, struct cxl_irq_info *irq_info)
+static bool cxl_is_translation_fault(struct cxl_afu *afu, u64 dsisr)
+{
+	if ((cxl_is_psl8(afu)) && (dsisr & CXL_PSL_DSISR_TRANS))
+		return true;
+
+	if ((cxl_is_psl9(afu)) && (dsisr & CXL_PSL9_DSISR_An_TF))
+		return true;
+
+	return false;
+}
+
+irqreturn_t cxl_fail_irq_psl(struct cxl_afu *afu, struct cxl_irq_info *irq_info)
 {
-	if (irq_info->dsisr & CXL_PSL_DSISR_TRANS)
+	if (cxl_is_translation_fault(afu, irq_info->dsisr))
 		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
 	else
 		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
@@ -932,7 +1142,7 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
 	struct cxl_context *ctx;
 	struct cxl_irq_info irq_info;
 	u64 phreg = cxl_p2n_read(afu, CXL_PSL_PEHandle_An);
-	int ph, ret;
+	int ph, ret = IRQ_HANDLED, res;
 
 	/* check if eeh kicked in while the interrupt was in flight */
 	if (unlikely(phreg == ~0ULL)) {
@@ -943,15 +1153,18 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
 	}
 	/* Mask the pe-handle from register value */
 	ph = phreg & 0xffff;
-	if ((ret = native_get_irq_info(afu, &irq_info))) {
-		WARN(1, "Unable to get CXL IRQ Info: %i\n", ret);
-		return fail_psl_irq(afu, &irq_info);
+	if ((res = native_get_irq_info(afu, &irq_info))) {
+		WARN(1, "Unable to get CXL IRQ Info: %i\n", res);
+		if (afu->adapter->native->sl_ops->fail_irq)
+			return afu->adapter->native->sl_ops->fail_irq(afu, &irq_info);
+		return ret;
 	}
 
 	rcu_read_lock();
 	ctx = idr_find(&afu->contexts_idr, ph);
 	if (ctx) {
-		ret = cxl_irq(irq, ctx, &irq_info);
+		if (afu->adapter->native->sl_ops->handle_interrupt)
+			ret = afu->adapter->native->sl_ops->handle_interrupt(irq, ctx, &irq_info);
 		rcu_read_unlock();
 		return ret;
 	}
@@ -961,7 +1174,9 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
 		" %016llx\n(Possible AFU HW issue - was a term/remove acked"
 		" with outstanding transactions?)\n", ph, irq_info.dsisr,
 		irq_info.dar);
-	return fail_psl_irq(afu, &irq_info);
+	if (afu->adapter->native->sl_ops->fail_irq)
+		ret = afu->adapter->native->sl_ops->fail_irq(afu, &irq_info);
+	return ret;
 }
 
 static void native_irq_wait(struct cxl_context *ctx)
@@ -979,7 +1194,11 @@ static void native_irq_wait(struct cxl_context *ctx)
 		if (ph != ctx->pe)
 			return;
 		dsisr = cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An);
-		if ((dsisr & CXL_PSL_DSISR_PENDING) == 0)
+		if (cxl_is_psl8(ctx->afu) &&
+		   ((dsisr & CXL_PSL_DSISR_PENDING) == 0))
+			return;
+		if (cxl_is_psl9(ctx->afu) &&
+		   ((dsisr & CXL_PSL9_DSISR_PENDING) == 0))
 			return;
 		/*
 		 * We are waiting for the workqueue to process our
@@ -996,25 +1215,33 @@ static void native_irq_wait(struct cxl_context *ctx)
 static irqreturn_t native_slice_irq_err(int irq, void *data)
 {
 	struct cxl_afu *afu = data;
-	u64 fir_slice, errstat, serr, afu_debug, afu_error, dsisr;
+	u64 errstat, serr, afu_error, dsisr;
+	u64 fir_slice, afu_debug, irq_mask;
 
 	/*
 	 * slice err interrupt is only used with full PSL (no XSL)
 	 */
 	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
-	fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An);
 	errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
-	afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An);
 	afu_error = cxl_p2n_read(afu, CXL_AFU_ERR_An);
 	dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
 	cxl_afu_decode_psl_serr(afu, serr);
-	dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice);
+
+	if (cxl_is_power8()) {
+		fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An);
+		afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An);
+		dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice);
+		dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
+	}
 	dev_crit(&afu->dev, "CXL_PSL_ErrStat_An: 0x%016llx\n", errstat);
-	dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
 	dev_crit(&afu->dev, "AFU_ERR_An: 0x%.16llx\n", afu_error);
 	dev_crit(&afu->dev, "PSL_DSISR_An: 0x%.16llx\n", dsisr);
 
+	/* mask off the IRQ so it won't retrigger until the AFU is reset */
+	irq_mask = (serr & CXL_PSL_SERR_An_IRQS) >> 32;
+	serr |= irq_mask;
 	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
+	dev_info(&afu->dev, "Further such interrupts will be masked until the AFU is reset\n");
 
 	return IRQ_HANDLED;
 }
@@ -1103,7 +1330,15 @@ int cxl_native_register_serr_irq(struct cxl_afu *afu)
 	}
 
 	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
-	serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff);
+	if (cxl_is_power8())
+		serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff);
+	if (cxl_is_power9()) {
+		/*
+		 * By default, all errors are masked. So don't set all masks.
+		 * Slice errors will be transfered.
+		 */
+		serr = (serr & ~0xff0000007fffffffULL) | (afu->serr_hwirq & 0xffff);
+	}
 	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
 
 	return 0;
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index b27ea98b781f..6dc1ee5b92c9 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -60,7 +60,7 @@
 #define CXL_VSEC_PROTOCOL_MASK   0xe0
 #define CXL_VSEC_PROTOCOL_1024TB 0x80
 #define CXL_VSEC_PROTOCOL_512TB  0x40
-#define CXL_VSEC_PROTOCOL_256TB  0x20 /* Power 8 uses this */
+#define CXL_VSEC_PROTOCOL_256TB  0x20 /* Power 8/9 uses this */
 #define CXL_VSEC_PROTOCOL_ENABLE 0x01
 
 #define CXL_READ_VSEC_PSL_REVISION(dev, vsec, dest) \
@@ -123,6 +123,8 @@ static const struct pci_device_id cxl_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x044b), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x04cf), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0601), },
+	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0623), },
+	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0628), },
 	{ PCI_DEVICE_CLASS(0x120000, ~0), },
 
 	{ }
@@ -324,38 +326,59 @@ static void dump_afu_descriptor(struct cxl_afu *afu)
 #undef show_reg
 }
 
-#define CAPP_UNIT0_ID 0xBA
-#define CAPP_UNIT1_ID 0XBE
+#define P8_CAPP_UNIT0_ID 0xBA
+#define P8_CAPP_UNIT1_ID 0XBE
+#define P9_CAPP_UNIT0_ID 0xC0
+#define P9_CAPP_UNIT1_ID 0xE0
 
-static u64 get_capp_unit_id(struct device_node *np)
+static int get_phb_index(struct device_node *np, u32 *phb_index)
 {
-	u32 phb_index;
+	if (of_property_read_u32(np, "ibm,phb-index", phb_index))
+		return -ENODEV;
+	return 0;
+}
 
+static u64 get_capp_unit_id(struct device_node *np, u32 phb_index)
+{
 	/*
-	 * For chips other than POWER8NVL, we only have CAPP 0,
-	 * irrespective of which PHB is used.
+	 * POWER 8:
+	 *  - For chips other than POWER8NVL, we only have CAPP 0,
+	 *    irrespective of which PHB is used.
+	 *  - For POWER8NVL, assume CAPP 0 is attached to PHB0 and
+	 *    CAPP 1 is attached to PHB1.
 	 */
-	if (!pvr_version_is(PVR_POWER8NVL))
-		return CAPP_UNIT0_ID;
+	if (cxl_is_power8()) {
+		if (!pvr_version_is(PVR_POWER8NVL))
+			return P8_CAPP_UNIT0_ID;
+
+		if (phb_index == 0)
+			return P8_CAPP_UNIT0_ID;
+
+		if (phb_index == 1)
+			return P8_CAPP_UNIT1_ID;
+	}
 
 	/*
-	 * For POWER8NVL, assume CAPP 0 is attached to PHB0 and
-	 * CAPP 1 is attached to PHB1.
+	 * POWER 9:
+	 *   PEC0 (PHB0). Capp ID = CAPP0 (0b1100_0000)
+	 *   PEC1 (PHB1 - PHB2). No capi mode
+	 *   PEC2 (PHB3 - PHB4 - PHB5): Capi mode on PHB3 only. Capp ID = CAPP1 (0b1110_0000)
 	 */
-	if (of_property_read_u32(np, "ibm,phb-index", &phb_index))
-		return 0;
+	if (cxl_is_power9()) {
+		if (phb_index == 0)
+			return P9_CAPP_UNIT0_ID;
 
-	if (phb_index == 0)
-		return CAPP_UNIT0_ID;
-
-	if (phb_index == 1)
-		return CAPP_UNIT1_ID;
+		if (phb_index == 3)
+			return P9_CAPP_UNIT1_ID;
+	}
 
 	return 0;
 }
 
-static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id)
+static int calc_capp_routing(struct pci_dev *dev, u64 *chipid,
+			     u32 *phb_index, u64 *capp_unit_id)
 {
+	int rc;
 	struct device_node *np;
 	const __be32 *prop;
 
@@ -366,8 +389,16 @@ static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id
 		np = of_get_next_parent(np);
 	if (!np)
 		return -ENODEV;
+
 	*chipid = be32_to_cpup(prop);
-	*capp_unit_id = get_capp_unit_id(np);
+
+	rc = get_phb_index(np, phb_index);
+	if (rc) {
+		pr_err("cxl: invalid phb index\n");
+		return rc;
+	}
+
+	*capp_unit_id = get_capp_unit_id(np, *phb_index);
 	of_node_put(np);
 	if (!*capp_unit_id) {
 		pr_err("cxl: invalid capp unit id\n");
@@ -377,14 +408,104 @@ static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id
 	return 0;
 }
 
-static int init_implementation_adapter_psl_regs(struct cxl *adapter, struct pci_dev *dev)
+static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci_dev *dev)
+{
+	u64 xsl_dsnctl, psl_fircntl;
+	u64 chipid;
+	u32 phb_index;
+	u64 capp_unit_id;
+	int rc;
+
+	rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
+	if (rc)
+		return rc;
+
+	/*
+	 * CAPI Identifier bits [0:7]
+	 * bit 61:60 MSI bits --> 0
+	 * bit 59 TVT selector --> 0
+	 */
+
+	/*
+	 * Tell XSL where to route data to.
+	 * The field chipid should match the PHB CAPI_CMPM register
+	 */
+	xsl_dsnctl = ((u64)0x2 << (63-7)); /* Bit 57 */
+	xsl_dsnctl |= (capp_unit_id << (63-15));
+
+	/* nMMU_ID Defaults to: b’000001001’*/
+	xsl_dsnctl |= ((u64)0x09 << (63-28));
+
+	if (cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+		/*
+		 * Used to identify CAPI packets which should be sorted into
+		 * the Non-Blocking queues by the PHB. This field should match
+		 * the PHB PBL_NBW_CMPM register
+		 * nbwind=0x03, bits [57:58], must include capi indicator.
+		 * Not supported on P9 DD1.
+		 */
+		xsl_dsnctl |= ((u64)0x03 << (63-47));
+
+		/*
+		 * Upper 16b address bits of ASB_Notify messages sent to the
+		 * system. Need to match the PHB’s ASN Compare/Mask Register.
+		 * Not supported on P9 DD1.
+		 */
+		xsl_dsnctl |= ((u64)0x04 << (63-55));
+	}
+
+	cxl_p1_write(adapter, CXL_XSL9_DSNCTL, xsl_dsnctl);
+
+	/* Set fir_cntl to recommended value for production env */
+	psl_fircntl = (0x2ULL << (63-3)); /* ce_report */
+	psl_fircntl |= (0x1ULL << (63-6)); /* FIR_report */
+	psl_fircntl |= 0x1ULL; /* ce_thresh */
+	cxl_p1_write(adapter, CXL_PSL9_FIR_CNTL, psl_fircntl);
+
+	/* vccredits=0x1  pcklat=0x4 */
+	cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x0000000000001810ULL);
+
+	/*
+	 * For debugging with trace arrays.
+	 * Configure RX trace 0 segmented mode.
+	 * Configure CT trace 0 segmented mode.
+	 * Configure LA0 trace 0 segmented mode.
+	 * Configure LA1 trace 0 segmented mode.
+	 */
+	cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000000ULL);
+	cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000003ULL);
+	cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000005ULL);
+	cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000006ULL);
+
+	/*
+	 * A response to an ASB_Notify request is returned by the
+	 * system as an MMIO write to the address defined in
+	 * the PSL_TNR_ADDR register
+	 */
+	/* PSL_TNR_ADDR */
+
+	/* NORST */
+	cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000000000000000ULL);
+
+	/* allocate the apc machines */
+	cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000003FFFF0000ULL);
+
+	/* Disable vc dd1 fix */
+	if ((cxl_is_power9() && cpu_has_feature(CPU_FTR_POWER9_DD1)))
+		cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0400000000000001ULL);
+
+	return 0;
+}
+
+static int init_implementation_adapter_regs_psl8(struct cxl *adapter, struct pci_dev *dev)
 {
 	u64 psl_dsnctl, psl_fircntl;
 	u64 chipid;
+	u32 phb_index;
 	u64 capp_unit_id;
 	int rc;
 
-	rc = calc_capp_routing(dev, &chipid, &capp_unit_id);
+	rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
 	if (rc)
 		return rc;
 
@@ -409,14 +530,15 @@ static int init_implementation_adapter_psl_regs(struct cxl *adapter, struct pci_
 	return 0;
 }
 
-static int init_implementation_adapter_xsl_regs(struct cxl *adapter, struct pci_dev *dev)
+static int init_implementation_adapter_regs_xsl(struct cxl *adapter, struct pci_dev *dev)
 {
 	u64 xsl_dsnctl;
 	u64 chipid;
+	u32 phb_index;
 	u64 capp_unit_id;
 	int rc;
 
-	rc = calc_capp_routing(dev, &chipid, &capp_unit_id);
+	rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
 	if (rc)
 		return rc;
 
@@ -434,7 +556,13 @@ static int init_implementation_adapter_xsl_regs(struct cxl *adapter, struct pci_
 /* For the PSL this is a multiple for 0 < n <= 7: */
 #define PSL_2048_250MHZ_CYCLES 1
 
-static void write_timebase_ctrl_psl(struct cxl *adapter)
+static void write_timebase_ctrl_psl9(struct cxl *adapter)
+{
+	cxl_p1_write(adapter, CXL_PSL9_TB_CTLSTAT,
+		     TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES));
+}
+
+static void write_timebase_ctrl_psl8(struct cxl *adapter)
 {
 	cxl_p1_write(adapter, CXL_PSL_TB_CTLSTAT,
 		     TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES));
@@ -455,7 +583,12 @@ static void write_timebase_ctrl_xsl(struct cxl *adapter)
 		     TBSYNC_CNT(XSL_4000_CLOCKS));
 }
 
-static u64 timebase_read_psl(struct cxl *adapter)
+static u64 timebase_read_psl9(struct cxl *adapter)
+{
+	return cxl_p1_read(adapter, CXL_PSL9_Timebase);
+}
+
+static u64 timebase_read_psl8(struct cxl *adapter)
 {
 	return cxl_p1_read(adapter, CXL_PSL_Timebase);
 }
@@ -513,7 +646,12 @@ static void cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
 	return;
 }
 
-static int init_implementation_afu_psl_regs(struct cxl_afu *afu)
+static int init_implementation_afu_regs_psl9(struct cxl_afu *afu)
+{
+	return 0;
+}
+
+static int init_implementation_afu_regs_psl8(struct cxl_afu *afu)
 {
 	/* read/write masks for this slice */
 	cxl_p1n_write(afu, CXL_PSL_APCALLOC_A, 0xFFFFFFFEFEFEFEFEULL);
@@ -611,7 +749,7 @@ static int setup_cxl_bars(struct pci_dev *dev)
 	/*
 	 * BAR 4/5 has a special meaning for CXL and must be programmed with a
 	 * special value corresponding to the CXL protocol address range.
-	 * For POWER 8 that means bits 48:49 must be set to 10
+	 * For POWER 8/9 that means bits 48:49 must be set to 10
 	 */
 	pci_write_config_dword(dev, PCI_BASE_ADDRESS_4, 0x00000000);
 	pci_write_config_dword(dev, PCI_BASE_ADDRESS_5, 0x00020000);
@@ -968,7 +1106,7 @@ static int cxl_afu_descriptor_looks_ok(struct cxl_afu *afu)
 	}
 
 	if (afu->pp_psa && (afu->pp_size < PAGE_SIZE))
-		dev_warn(&afu->dev, "AFU uses < PAGE_SIZE per-process PSA!");
+		dev_warn(&afu->dev, "AFU uses pp_size(%#016llx) < PAGE_SIZE per-process PSA!\n", afu->pp_size);
 
 	for (i = 0; i < afu->crs_num; i++) {
 		rc = cxl_ops->afu_cr_read32(afu, i, 0, &val);
@@ -996,7 +1134,53 @@ static int cxl_afu_descriptor_looks_ok(struct cxl_afu *afu)
 	return 0;
 }
 
-static int sanitise_afu_regs(struct cxl_afu *afu)
+static int sanitise_afu_regs_psl9(struct cxl_afu *afu)
+{
+	u64 reg;
+
+	/*
+	 * Clear out any regs that contain either an IVTE or address or may be
+	 * waiting on an acknowledgment to try to be a bit safer as we bring
+	 * it online
+	 */
+	reg = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
+	if ((reg & CXL_AFU_Cntl_An_ES_MASK) != CXL_AFU_Cntl_An_ES_Disabled) {
+		dev_warn(&afu->dev, "WARNING: AFU was not disabled: %#016llx\n", reg);
+		if (cxl_ops->afu_reset(afu))
+			return -EIO;
+		if (cxl_afu_disable(afu))
+			return -EIO;
+		if (cxl_psl_purge(afu))
+			return -EIO;
+	}
+	cxl_p1n_write(afu, CXL_PSL_SPAP_An, 0x0000000000000000);
+	cxl_p1n_write(afu, CXL_PSL_AMBAR_An, 0x0000000000000000);
+	reg = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
+	if (reg) {
+		dev_warn(&afu->dev, "AFU had pending DSISR: %#016llx\n", reg);
+		if (reg & CXL_PSL9_DSISR_An_TF)
+			cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
+		else
+			cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
+	}
+	if (afu->adapter->native->sl_ops->register_serr_irq) {
+		reg = cxl_p1n_read(afu, CXL_PSL_SERR_An);
+		if (reg) {
+			if (reg & ~0x000000007fffffff)
+				dev_warn(&afu->dev, "AFU had pending SERR: %#016llx\n", reg);
+			cxl_p1n_write(afu, CXL_PSL_SERR_An, reg & ~0xffff);
+		}
+	}
+	reg = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
+	if (reg) {
+		dev_warn(&afu->dev, "AFU had pending error status: %#016llx\n", reg);
+		cxl_p2n_write(afu, CXL_PSL_ErrStat_An, reg);
+	}
+
+	return 0;
+}
+
+static int sanitise_afu_regs_psl8(struct cxl_afu *afu)
 {
 	u64 reg;
 
@@ -1102,8 +1286,11 @@ static int pci_configure_afu(struct cxl_afu *afu, struct cxl *adapter, struct pc
 	if ((rc = pci_map_slice_regs(afu, adapter, dev)))
 		return rc;
 
-	if ((rc = sanitise_afu_regs(afu)))
-		goto err1;
+	if (adapter->native->sl_ops->sanitise_afu_regs) {
+		rc = adapter->native->sl_ops->sanitise_afu_regs(afu);
+		if (rc)
+			goto err1;
+	}
 
 	/* We need to reset the AFU before we can read the AFU descriptor */
 	if ((rc = cxl_ops->afu_reset(afu)))
@@ -1248,8 +1435,13 @@ int cxl_pci_reset(struct cxl *adapter)
 
 	dev_info(&dev->dev, "CXL reset\n");
 
-	/* the adapter is about to be reset, so ignore errors */
-	cxl_data_cache_flush(adapter);
+	/*
+	 * The adapter is about to be reset, so ignore errors.
+	 * Not supported on P9 DD1
+	 */
+	if ((cxl_is_power8()) ||
+	    ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1))))
+		cxl_data_cache_flush(adapter);
 
 	/* pcie_warm_reset requests a fundamental pci reset which includes a
 	 * PERST assert/deassert.  PERST triggers a loading of the image
@@ -1332,6 +1524,7 @@ static int cxl_read_vsec(struct cxl *adapter, struct pci_dev *dev)
 	CXL_READ_VSEC_IMAGE_STATE(dev, vsec, &image_state);
 	adapter->user_image_loaded = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED);
 	adapter->perst_select_user = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED);
+	adapter->perst_loads_image = !!(image_state & CXL_VSEC_PERST_LOADS_IMAGE);
 
 	CXL_READ_VSEC_NAFUS(dev, vsec, &adapter->slices);
 	CXL_READ_VSEC_AFU_DESC_OFF(dev, vsec, &afu_desc_off);
@@ -1378,6 +1571,17 @@ static void cxl_fixup_malformed_tlp(struct cxl *adapter, struct pci_dev *dev)
 	pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, data);
 }
 
+static bool cxl_compatible_caia_version(struct cxl *adapter)
+{
+	if (cxl_is_power8() && (adapter->caia_major == 1))
+		return true;
+
+	if (cxl_is_power9() && (adapter->caia_major == 2))
+		return true;
+
+	return false;
+}
+
 static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev)
 {
 	if (adapter->vsec_status & CXL_STATUS_SECOND_PORT)
@@ -1388,6 +1592,12 @@ static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev)
 		return -EINVAL;
 	}
 
+	if (!cxl_compatible_caia_version(adapter)) {
+		dev_info(&dev->dev, "Ignoring card. PSL type is not supported (caia version: %d)\n",
+			 adapter->caia_major);
+		return -ENODEV;
+	}
+
 	if (!adapter->slices) {
 		/* Once we support dynamic reprogramming we can use the card if
 		 * it supports loadable AFUs */
@@ -1431,9 +1641,19 @@ static void cxl_release_adapter(struct device *dev)
 
 static int sanitise_adapter_regs(struct cxl *adapter)
 {
+	int rc = 0;
+
 	/* Clear PSL tberror bit by writing 1 to it */
 	cxl_p1_write(adapter, CXL_PSL_ErrIVTE, CXL_PSL_ErrIVTE_tberror);
-	return cxl_tlb_slb_invalidate(adapter);
+
+	if (adapter->native->sl_ops->invalidate_all) {
+		/* do not invalidate ERAT entries when not reloading on PERST */
+		if (cxl_is_power9() && (adapter->perst_loads_image))
+			return 0;
+		rc = adapter->native->sl_ops->invalidate_all(adapter);
+	}
+
+	return rc;
 }
 
 /* This should contain *only* operations that can safely be done in
@@ -1496,8 +1716,6 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev)
 	if ((rc = cxl_native_register_psl_err_irq(adapter)))
 		goto err;
 
-	/* Release the context lock as adapter is configured */
-	cxl_adapter_context_unlock(adapter);
 	return 0;
 
 err:
@@ -1516,25 +1734,65 @@ static void cxl_deconfigure_adapter(struct cxl *adapter)
 	pci_disable_device(pdev);
 }
 
-static const struct cxl_service_layer_ops psl_ops = {
-	.adapter_regs_init = init_implementation_adapter_psl_regs,
-	.afu_regs_init = init_implementation_afu_psl_regs,
+static const struct cxl_service_layer_ops psl9_ops = {
+	.adapter_regs_init = init_implementation_adapter_regs_psl9,
+	.invalidate_all = cxl_invalidate_all_psl9,
+	.afu_regs_init = init_implementation_afu_regs_psl9,
+	.sanitise_afu_regs = sanitise_afu_regs_psl9,
 	.register_serr_irq = cxl_native_register_serr_irq,
 	.release_serr_irq = cxl_native_release_serr_irq,
-	.debugfs_add_adapter_sl_regs = cxl_debugfs_add_adapter_psl_regs,
-	.debugfs_add_afu_sl_regs = cxl_debugfs_add_afu_psl_regs,
-	.psl_irq_dump_registers = cxl_native_psl_irq_dump_regs,
+	.handle_interrupt = cxl_irq_psl9,
+	.fail_irq = cxl_fail_irq_psl,
+	.activate_dedicated_process = cxl_activate_dedicated_process_psl9,
+	.attach_afu_directed = cxl_attach_afu_directed_psl9,
+	.attach_dedicated_process = cxl_attach_dedicated_process_psl9,
+	.update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl9,
+	.debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9,
+	.debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9,
+	.psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9,
 	.err_irq_dump_registers = cxl_native_err_irq_dump_regs,
-	.debugfs_stop_trace = cxl_stop_trace,
-	.write_timebase_ctrl = write_timebase_ctrl_psl,
-	.timebase_read = timebase_read_psl,
+	.debugfs_stop_trace = cxl_stop_trace_psl9,
+	.write_timebase_ctrl = write_timebase_ctrl_psl9,
+	.timebase_read = timebase_read_psl9,
+	.capi_mode = OPAL_PHB_CAPI_MODE_CAPI,
+	.needs_reset_before_disable = true,
+};
+
+static const struct cxl_service_layer_ops psl8_ops = {
+	.adapter_regs_init = init_implementation_adapter_regs_psl8,
+	.invalidate_all = cxl_invalidate_all_psl8,
+	.afu_regs_init = init_implementation_afu_regs_psl8,
+	.sanitise_afu_regs = sanitise_afu_regs_psl8,
+	.register_serr_irq = cxl_native_register_serr_irq,
+	.release_serr_irq = cxl_native_release_serr_irq,
+	.handle_interrupt = cxl_irq_psl8,
+	.fail_irq = cxl_fail_irq_psl,
+	.activate_dedicated_process = cxl_activate_dedicated_process_psl8,
+	.attach_afu_directed = cxl_attach_afu_directed_psl8,
+	.attach_dedicated_process = cxl_attach_dedicated_process_psl8,
+	.update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl8,
+	.debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl8,
+	.debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl8,
+	.psl_irq_dump_registers = cxl_native_irq_dump_regs_psl8,
+	.err_irq_dump_registers = cxl_native_err_irq_dump_regs,
+	.debugfs_stop_trace = cxl_stop_trace_psl8,
+	.write_timebase_ctrl = write_timebase_ctrl_psl8,
+	.timebase_read = timebase_read_psl8,
 	.capi_mode = OPAL_PHB_CAPI_MODE_CAPI,
 	.needs_reset_before_disable = true,
 };
 
 static const struct cxl_service_layer_ops xsl_ops = {
-	.adapter_regs_init = init_implementation_adapter_xsl_regs,
-	.debugfs_add_adapter_sl_regs = cxl_debugfs_add_adapter_xsl_regs,
+	.adapter_regs_init = init_implementation_adapter_regs_xsl,
+	.invalidate_all = cxl_invalidate_all_psl8,
+	.sanitise_afu_regs = sanitise_afu_regs_psl8,
+	.handle_interrupt = cxl_irq_psl8,
+	.fail_irq = cxl_fail_irq_psl,
+	.activate_dedicated_process = cxl_activate_dedicated_process_psl8,
+	.attach_afu_directed = cxl_attach_afu_directed_psl8,
+	.attach_dedicated_process = cxl_attach_dedicated_process_psl8,
+	.update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl8,
+	.debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_xsl,
 	.write_timebase_ctrl = write_timebase_ctrl_xsl,
 	.timebase_read = timebase_read_xsl,
 	.capi_mode = OPAL_PHB_CAPI_MODE_DMA,
@@ -1548,8 +1806,13 @@ static void set_sl_ops(struct cxl *adapter, struct pci_dev *dev)
 		adapter->native->sl_ops = &xsl_ops;
 		adapter->min_pe = 1; /* Workaround for CX-4 hardware bug */
 	} else {
-		dev_info(&dev->dev, "Device uses a PSL\n");
-		adapter->native->sl_ops = &psl_ops;
+		if (cxl_is_power8()) {
+			dev_info(&dev->dev, "Device uses a PSL8\n");
+			adapter->native->sl_ops = &psl8_ops;
+		} else {
+			dev_info(&dev->dev, "Device uses a PSL9\n");
+			adapter->native->sl_ops = &psl9_ops;
+		}
 	}
 }
 
@@ -1596,6 +1859,9 @@ static struct cxl *cxl_pci_init_adapter(struct pci_dev *dev)
 	if ((rc = cxl_sysfs_adapter_add(adapter)))
 		goto err_put1;
 
+	/* Release the context lock as adapter is configured */
+	cxl_adapter_context_unlock(adapter);
+
 	return adapter;
 
 err_put1:
@@ -1619,8 +1885,13 @@ static void cxl_pci_remove_adapter(struct cxl *adapter)
 	cxl_sysfs_adapter_remove(adapter);
 	cxl_debugfs_adapter_remove(adapter);
 
-	/* Flush adapter datacache as its about to be removed */
-	cxl_data_cache_flush(adapter);
+	/*
+	 * Flush adapter datacache as its about to be removed.
+	 * Not supported on P9 DD1.
+	 */
+	if ((cxl_is_power8()) ||
+	    ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1))))
+		cxl_data_cache_flush(adapter);
 
 	cxl_deconfigure_adapter(adapter);
 
@@ -1704,6 +1975,11 @@ static int cxl_probe(struct pci_dev *dev, const struct pci_device_id *id)
 		return -ENODEV;
 	}
 
+	if (cxl_is_power9() && !radix_enabled()) {
+		dev_info(&dev->dev, "Only Radix mode supported\n");
+		return -ENODEV;
+	}
+
 	if (cxl_verbose)
 		dump_cxl_config_space(dev);
 
@@ -1781,7 +2057,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 {
 	struct cxl *adapter = pci_get_drvdata(pdev);
 	struct cxl_afu *afu;
-	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result;
 	int i;
 
 	/* At this point, we could still have an interrupt pending.
@@ -1885,16 +2161,26 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
 
-		result = cxl_vphb_error_detected(afu, state);
-
-		/* Only continue if everyone agrees on NEED_RESET */
-		if (result != PCI_ERS_RESULT_NEED_RESET)
-			return result;
+		afu_result = cxl_vphb_error_detected(afu, state);
 
 		cxl_context_detach_all(afu);
 		cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
 		pci_deconfigure_afu(afu);
+
+		/* Disconnect trumps all, NONE trumps NEED_RESET */
+		if (afu_result == PCI_ERS_RESULT_DISCONNECT)
+			result = PCI_ERS_RESULT_DISCONNECT;
+		else if ((afu_result == PCI_ERS_RESULT_NONE) &&
+			 (result == PCI_ERS_RESULT_NEED_RESET))
+			result = PCI_ERS_RESULT_NONE;
 	}
+
+	/* should take the context lock here */
+	if (cxl_adapter_context_lock(adapter) != 0)
+		dev_warn(&adapter->dev,
+			 "Couldn't take context lock with %d active-contexts\n",
+			 atomic_read(&adapter->contexts_num));
+
 	cxl_deconfigure_adapter(adapter);
 
 	return result;
@@ -1913,6 +2199,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 	if (cxl_configure_adapter(adapter, pdev))
 		goto err;
 
+	/*
+	 * Unlock context activation for the adapter. Ideally this should be
+	 * done in cxl_pci_resume but cxlflash module tries to activate the
+	 * master context as part of slot_reset callback.
+	 */
+	cxl_adapter_context_unlock(adapter);
+
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
 
diff --git a/drivers/misc/cxl/trace.h b/drivers/misc/cxl/trace.h
index 751d6119683e..b8e300af0e55 100644
--- a/drivers/misc/cxl/trace.h
+++ b/drivers/misc/cxl/trace.h
@@ -17,6 +17,15 @@
 
 #include "cxl.h"
 
+#define dsisr_psl9_flags(flags) \
+	__print_flags(flags, "|", \
+		{ CXL_PSL9_DSISR_An_CO_MASK,	"FR" }, \
+		{ CXL_PSL9_DSISR_An_TF,		"TF" }, \
+		{ CXL_PSL9_DSISR_An_PE,		"PE" }, \
+		{ CXL_PSL9_DSISR_An_AE,		"AE" }, \
+		{ CXL_PSL9_DSISR_An_OC,		"OC" }, \
+		{ CXL_PSL9_DSISR_An_S,		"S" })
+
 #define DSISR_FLAGS \
 	{ CXL_PSL_DSISR_An_DS,	"DS" }, \
 	{ CXL_PSL_DSISR_An_DM,	"DM" }, \
@@ -154,6 +163,40 @@ TRACE_EVENT(cxl_afu_irq,
 	)
 );
 
+TRACE_EVENT(cxl_psl9_irq,
+	TP_PROTO(struct cxl_context *ctx, int irq, u64 dsisr, u64 dar),
+
+	TP_ARGS(ctx, irq, dsisr, dar),
+
+	TP_STRUCT__entry(
+		__field(u8, card)
+		__field(u8, afu)
+		__field(u16, pe)
+		__field(int, irq)
+		__field(u64, dsisr)
+		__field(u64, dar)
+	),
+
+	TP_fast_assign(
+		__entry->card = ctx->afu->adapter->adapter_num;
+		__entry->afu = ctx->afu->slice;
+		__entry->pe = ctx->pe;
+		__entry->irq = irq;
+		__entry->dsisr = dsisr;
+		__entry->dar = dar;
+	),
+
+	TP_printk("afu%i.%i pe=%i irq=%i dsisr=0x%016llx dsisr=%s dar=0x%016llx",
+		__entry->card,
+		__entry->afu,
+		__entry->pe,
+		__entry->irq,
+		__entry->dsisr,
+		dsisr_psl9_flags(__entry->dsisr),
+		__entry->dar
+	)
+);
+
 TRACE_EVENT(cxl_psl_irq,
 	TP_PROTO(struct cxl_context *ctx, int irq, u64 dsisr, u64 dar),
 
diff --git a/drivers/of/base.c b/drivers/of/base.c
index d7c4629a3a2d..0ea16bd3c8f1 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1213,6 +1213,37 @@ int of_property_read_u32_index(const struct device_node *np,
 EXPORT_SYMBOL_GPL(of_property_read_u32_index);
 
 /**
+ * of_property_read_u64_index - Find and read a u64 from a multi-value property.
+ *
+ * @np:		device node from which the property value is to be read.
+ * @propname:	name of the property to be searched.
+ * @index:	index of the u64 in the list of values
+ * @out_value:	pointer to return value, modified only if no error.
+ *
+ * Search for a property in a device node and read nth 64-bit value from
+ * it. Returns 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
+ *
+ * The out_value is modified only if a valid u64 value can be decoded.
+ */
+int of_property_read_u64_index(const struct device_node *np,
+				       const char *propname,
+				       u32 index, u64 *out_value)
+{
+	const u64 *val = of_find_property_value_of_size(np, propname,
+					((index + 1) * sizeof(*out_value)),
+					0, NULL);
+
+	if (IS_ERR(val))
+		return PTR_ERR(val);
+
+	*out_value = be64_to_cpup(((__be64 *)val) + index);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(of_property_read_u64_index);
+
+/**
  * of_property_read_variable_u8_array - Find and read an array of u8 from a
  * property, with bounds on the minimum and maximum array size.
  *
diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c
index 4d7bc3f4124a..c6fe2a4a7a6a 100644
--- a/drivers/pcmcia/electra_cf.c
+++ b/drivers/pcmcia/electra_cf.c
@@ -207,7 +207,7 @@ static int electra_cf_probe(struct platform_device *ofdev)
 		return -ENOMEM;
 
 	setup_timer(&cf->timer, electra_cf_timer, (unsigned long)cf);
-	cf->irq = NO_IRQ;
+	cf->irq = 0;
 
 	cf->ofdev = ofdev;
 	cf->mem_phys = mem.start;
@@ -313,7 +313,7 @@ fail3:
 fail2:
 	release_mem_region(cf->mem_phys, cf->mem_size);
 fail1:
-	if (cf->irq != NO_IRQ)
+	if (cf->irq)
 		free_irq(cf->irq, cf);
 
 	if (cf->io_virt)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 78dca1aa6410..63112c36ab2d 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -685,7 +685,7 @@ static void tce_iommu_free_table(struct tce_container *container,
 	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 
 	tce_iommu_userspace_view_free(tbl, container->mm);
-	tbl->it_ops->free(tbl);
+	iommu_tce_table_put(tbl);
 	decrement_locked_vm(container->mm, pages);
 }
 
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 47e4da5b4fa2..30f90c1a0aaf 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -381,6 +381,7 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
 	return this_cpu_ptr(&kprobe_ctlblk);
 }
 
+kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset);
 int register_kprobe(struct kprobe *p);
 void unregister_kprobe(struct kprobe *p);
 int register_kprobes(struct kprobe **kps, int num);
diff --git a/include/linux/of.h b/include/linux/of.h
index e5d4225fda35..50fcdb54087f 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -294,6 +294,9 @@ extern int of_property_count_elems_of_size(const struct device_node *np,
 extern int of_property_read_u32_index(const struct device_node *np,
 				       const char *propname,
 				       u32 index, u32 *out_value);
+extern int of_property_read_u64_index(const struct device_node *np,
+				       const char *propname,
+				       u32 index, u64 *out_value);
 extern int of_property_read_variable_u8_array(const struct device_node *np,
 					const char *propname, u8 *out_values,
 					size_t sz_min, size_t sz_max);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d09a9cd021b1..b1c0b187acfe 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -922,6 +922,7 @@ enum perf_callchain_context {
 #define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
 #define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
 
+#if defined(__LITTLE_ENDIAN_BITFIELD)
 union perf_mem_data_src {
 	__u64 val;
 	struct {
@@ -933,6 +934,21 @@ union perf_mem_data_src {
 			mem_rsvd:31;
 	};
 };
+#elif defined(__BIG_ENDIAN_BITFIELD)
+union perf_mem_data_src {
+	__u64 val;
+	struct {
+		__u64	mem_rsvd:31,
+			mem_dtlb:7,	/* tlb access */
+			mem_lock:2,	/* lock instr */
+			mem_snoop:5,	/* snoop mode */
+			mem_lvl:14,	/* memory hierarchy level */
+			mem_op:5;	/* type of opcode */
+	};
+};
+#else
+#error "Unknown endianness"
+#endif
 
 /* type of opcode (load/store/prefetch,code) */
 #define PERF_MEM_OP_NA		0x01 /* not available */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d733479a10ee..7367e0ec6f81 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -58,15 +58,6 @@
 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
 
 
-/*
- * Some oddball architectures like 64bit powerpc have function descriptors
- * so this must be overridable.
- */
-#ifndef kprobe_lookup_name
-#define kprobe_lookup_name(name, addr) \
-	addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
-#endif
-
 static int kprobes_initialized;
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -81,6 +72,12 @@ static struct {
 	raw_spinlock_t lock ____cacheline_aligned_in_smp;
 } kretprobe_table_locks[KPROBE_TABLE_SIZE];
 
+kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
+					unsigned int __unused)
+{
+	return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
+}
+
 static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
 {
 	return &(kretprobe_table_locks[hash].lock);
@@ -746,13 +743,20 @@ static void kill_optimized_kprobe(struct kprobe *p)
 	arch_remove_optimized_kprobe(op);
 }
 
+static inline
+void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
+{
+	if (!kprobe_ftrace(p))
+		arch_prepare_optimized_kprobe(op, p);
+}
+
 /* Try to prepare optimized instructions */
 static void prepare_optimized_kprobe(struct kprobe *p)
 {
 	struct optimized_kprobe *op;
 
 	op = container_of(p, struct optimized_kprobe, kp);
-	arch_prepare_optimized_kprobe(op, p);
+	__prepare_optimized_kprobe(op, p);
 }
 
 /* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -766,7 +770,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
 
 	INIT_LIST_HEAD(&op->list);
 	op->kp.addr = p->addr;
-	arch_prepare_optimized_kprobe(op, p);
+	__prepare_optimized_kprobe(op, p);
 
 	return &op->kp;
 }
@@ -1398,7 +1402,7 @@ static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
 		goto invalid;
 
 	if (symbol_name) {
-		kprobe_lookup_name(symbol_name, addr);
+		addr = kprobe_lookup_name(symbol_name, offset);
 		if (!addr)
 			return ERR_PTR(-ENOENT);
 	}
@@ -2218,8 +2222,8 @@ static int __init init_kprobes(void)
 	if (kretprobe_blacklist_size) {
 		/* lookup the function address from its name */
 		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
-			kprobe_lookup_name(kretprobe_blacklist[i].name,
-					   kretprobe_blacklist[i].addr);
+			kretprobe_blacklist[i].addr =
+				kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
 			if (!kretprobe_blacklist[i].addr)
 				printk("kretprobe: lookup failed: %s\n",
 				       kretprobe_blacklist[i].name);
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index d09a9cd021b1..b1c0b187acfe 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -922,6 +922,7 @@ enum perf_callchain_context {
 #define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
 #define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
 
+#if defined(__LITTLE_ENDIAN_BITFIELD)
 union perf_mem_data_src {
 	__u64 val;
 	struct {
@@ -933,6 +934,21 @@ union perf_mem_data_src {
 			mem_rsvd:31;
 	};
 };
+#elif defined(__BIG_ENDIAN_BITFIELD)
+union perf_mem_data_src {
+	__u64 val;
+	struct {
+		__u64	mem_rsvd:31,
+			mem_dtlb:7,	/* tlb access */
+			mem_lock:2,	/* lock instr */
+			mem_snoop:5,	/* snoop mode */
+			mem_lvl:14,	/* memory hierarchy level */
+			mem_op:5;	/* type of opcode */
+	};
+};
+#else
+#error "Unknown endianness"
+#endif
 
 /* type of opcode (load/store/prefetch,code) */
 #define PERF_MEM_OP_NA		0x01 /* not available */
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index 2132ff8eb4e7..03e1617367d3 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -14,6 +14,7 @@ export CFLAGS
 
 SUB_DIRS = alignment		\
 	   benchmarks		\
+	   cache_shape		\
 	   copyloops		\
 	   context_switch	\
 	   dscr			\
diff --git a/tools/testing/selftests/powerpc/cache_shape/.gitignore b/tools/testing/selftests/powerpc/cache_shape/.gitignore
new file mode 100644
index 000000000000..ec1848434be5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/.gitignore
@@ -0,0 +1 @@
+cache_shape
diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile
new file mode 100644
index 000000000000..b24485ab30e2
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/Makefile
@@ -0,0 +1,10 @@
+TEST_PROGS := cache_shape
+
+all: $(TEST_PROGS)
+
+$(TEST_PROGS): ../harness.c ../utils.c
+
+include ../../lib.mk
+
+clean:
+	rm -f $(TEST_PROGS) *.o
diff --git a/tools/testing/selftests/powerpc/cache_shape/cache_shape.c b/tools/testing/selftests/powerpc/cache_shape/cache_shape.c
new file mode 100644
index 000000000000..29ec07eba7f9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/cache_shape.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2017, Michael Ellerman, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <link.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+#ifndef AT_L1I_CACHESIZE
+#define AT_L1I_CACHESIZE	40
+#define AT_L1I_CACHEGEOMETRY	41
+#define AT_L1D_CACHESIZE	42
+#define AT_L1D_CACHEGEOMETRY	43
+#define AT_L2_CACHESIZE		44
+#define AT_L2_CACHEGEOMETRY	45
+#define AT_L3_CACHESIZE		46
+#define AT_L3_CACHEGEOMETRY	47
+#endif
+
+static void print_size(const char *label, uint32_t val)
+{
+	printf("%s cache size: %#10x %10dB %10dK\n", label, val, val, val / 1024);
+}
+
+static void print_geo(const char *label, uint32_t val)
+{
+	uint16_t assoc;
+
+	printf("%s line size:  %#10x       ", label, val & 0xFFFF);
+
+	assoc = val >> 16;
+	if (assoc)
+		printf("%u-way", assoc);
+	else
+		printf("fully");
+
+	printf(" associative\n");
+}
+
+static int test_cache_shape()
+{
+	static char buffer[4096];
+	ElfW(auxv_t) *p;
+	int found;
+
+	FAIL_IF(read_auxv(buffer, sizeof(buffer)));
+
+	found = 0;
+
+	p = find_auxv_entry(AT_L1I_CACHESIZE, buffer);
+	if (p) {
+		found++;
+		print_size("L1I ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L1I_CACHEGEOMETRY, buffer);
+	if (p) {
+		found++;
+		print_geo("L1I ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L1D_CACHESIZE, buffer);
+	if (p) {
+		found++;
+		print_size("L1D ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L1D_CACHEGEOMETRY, buffer);
+	if (p) {
+		found++;
+		print_geo("L1D ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L2_CACHESIZE, buffer);
+	if (p) {
+		found++;
+		print_size("L2  ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L2_CACHEGEOMETRY, buffer);
+	if (p) {
+		found++;
+		print_geo("L2  ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L3_CACHESIZE, buffer);
+	if (p) {
+		found++;
+		print_size("L3  ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L3_CACHEGEOMETRY, buffer);
+	if (p) {
+		found++;
+		print_geo("L3  ", (uint32_t)p->a_un.a_val);
+	}
+
+	/* If we found none we're probably on a system where they don't exist */
+	SKIP_IF(found == 0);
+
+	/* But if we found any, we expect to find them all */
+	FAIL_IF(found != 8);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(test_cache_shape, "cache_shape");
+}
diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
index 53405e8a52ab..735815b3ad7f 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -24,7 +24,11 @@ typedef uint8_t u8;
 
 void test_harness_set_timeout(uint64_t time);
 int test_harness(int (test_function)(void), char *name);
-extern void *get_auxv_entry(int type);
+
+int read_auxv(char *buf, ssize_t buf_size);
+void *find_auxv_entry(int type, char *auxv);
+void *get_auxv_entry(int type);
+
 int pick_online_cpu(void);
 
 static inline bool have_hwcap(unsigned long ftr)
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
index dcf74184bfd0..d46916867a6f 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -19,45 +19,64 @@
 
 static char auxv[4096];
 
-void *get_auxv_entry(int type)
+int read_auxv(char *buf, ssize_t buf_size)
 {
-	ElfW(auxv_t) *p;
-	void *result;
 	ssize_t num;
-	int fd;
+	int rc, fd;
 
 	fd = open("/proc/self/auxv", O_RDONLY);
 	if (fd == -1) {
 		perror("open");
-		return NULL;
+		return -errno;
 	}
 
-	result = NULL;
-
-	num = read(fd, auxv, sizeof(auxv));
+	num = read(fd, buf, buf_size);
 	if (num < 0) {
 		perror("read");
+		rc = -EIO;
 		goto out;
 	}
 
-	if (num > sizeof(auxv)) {
-		printf("Overflowed auxv buffer\n");
+	if (num > buf_size) {
+		printf("overflowed auxv buffer\n");
+		rc = -EOVERFLOW;
 		goto out;
 	}
 
+	rc = 0;
+out:
+	close(fd);
+	return rc;
+}
+
+void *find_auxv_entry(int type, char *auxv)
+{
+	ElfW(auxv_t) *p;
+
 	p = (ElfW(auxv_t) *)auxv;
 
 	while (p->a_type != AT_NULL) {
-		if (p->a_type == type) {
-			result = (void *)p->a_un.a_val;
-			break;
-		}
+		if (p->a_type == type)
+			return p;
 
 		p++;
 	}
-out:
-	close(fd);
-	return result;
+
+	return NULL;
+}
+
+void *get_auxv_entry(int type)
+{
+	ElfW(auxv_t) *p;
+
+	if (read_auxv(auxv, sizeof(auxv)))
+		return NULL;
+
+	p = find_auxv_entry(type, auxv);
+	if (p)
+		return (void *)p->a_un.a_val;
+
+	return NULL;
 }
 
 int pick_online_cpu(void)
author	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-05 11:36:44 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-05 11:36:44 -0700
commit	7246f60068840847bdcf595be5f0b5ca632736e0 (patch)
tree	fd9a963a03c2655f3ba9d1ced3c87a2775f5b166
parent	e579dde654fc2c6b0d3e4b77a9a4b2d2405c510e (diff)
parent	700b7eadd5625d22b8235fb21259b3d7d564c000 (diff)
download	lwn-7246f60068840847bdcf595be5f0b5ca632736e0.tar.gz lwn-7246f60068840847bdcf595be5f0b5ca632736e0.zip